2015-01-18

I have a set of files that I copy between my MacBook and a flash drive on a daily basis. The flash drive is formatted as FAT32. This allows me to work with the same set of files on a Windows machine during the day and my Mac at night. It also functions as a twice-daily backup.

I used to use /usr/bin/rsync to perform the copies, but I never found the right command line to automatically, correctly sync in both directions. At one point, I used a BASH script to examine the times of a 'sentinel' file and rsync in the appropriate direction. This worked okay except for those times when I failed to touch the sentinel file and the next sync destroyed part of my work. After that happened a few times, I decided to write a Python script to handle the sync. I am also keeping more files in my sync directories for backup purposes now, so I need a more reliable script.

mydircmp() recursively walks both the local and the remote directories. It collects the paths of files that exist on one side but not the other. It collects the paths of files that exist on both sides, but with different times. It also looks for discrepancies such as files with the same timestamp having different sizes, or a path being a file on one side and a directory on the other.

#!/usr/bin/env python

import sys, os, os.path, shutil
import re, subprocess

# Remote directory used when none is supplied on the command line.
# Change this default:
DEFAULT_REMOTE_DIR = '/Volumes/someDisk/someDirectory'

# Mode that fixPermissionBits() applies to files it classifies as executable.
EXEBITS = 0o755
# NOTE(review): not referenced in the visible code; presumably the {path: mode}
# table passed as the `overrides` argument of fixPermissionBits() -- confirm.
OVERRIDES = {}
# Set to 1 by the -f/--filecopy option; while 0 the script only reports what it would copy.
flagDoCopy = 0

def mydircmp(LOCAL,REMOTE):
	"""Recursively compare two directory trees and classify every difference.

	LOCAL and REMOTE are the paths of the two directories to compare.

	Returns a dict of lists:
	  'local_only'      -- paths under LOCAL that have no counterpart under REMOTE
	  'remote_only'     -- paths under REMOTE that have no counterpart under LOCAL
	  'local_to_remote' -- local paths of files whose local copy is newer
	  'remote_to_local' -- remote paths of files whose remote copy is newer
	  'failure_type'    -- local paths of entries that are not a file on both
	                       sides nor a directory on both sides
	  'failure_size'    -- local paths of files with identical modification
	                       times but different sizes

	An entry present on only one side is reported once and not descended into.
	Failures are reported as full local paths so that a problem inside a nested
	directory can be located. Errors from os.listdir() and the os.path queries
	(for example a missing directory) propagate to the caller as OSError.
	"""
	local_list = sorted(os.listdir(LOCAL))
	remote_list = sorted(os.listdir(REMOTE))
	# Sets give constant-time membership tests; the sorted lists fix the report order.
	local_names = set(local_list)
	remote_names = set(remote_list)

	z = {
		'local_only' : [],
		'remote_only' : [],
		'local_to_remote' : [],
		'remote_to_local' : [],
		'failure_type' : [],
		'failure_size' : []
	}
	common = []

	for L in local_list:
		if L in remote_names:
			common.append(L)
		else:
			z['local_only'].append(os.path.join(LOCAL,L))

	for R in remote_list:
		if R not in local_names:
			z['remote_only'].append(os.path.join(REMOTE,R))

	for C in common:
		LC = os.path.join(LOCAL,C)
		RC = os.path.join(REMOTE,C)
		if os.path.isfile(LC) and os.path.isfile(RC):
			TLC = os.path.getmtime(LC)
			TRC = os.path.getmtime(RC)
			if TLC > TRC:
				z['local_to_remote'].append(LC)
			elif TLC < TRC:
				z['remote_to_local'].append(RC)
			elif os.path.getsize(LC) != os.path.getsize(RC):
				# Same timestamp but different content length: cannot tell which side wins.
				z['failure_size'].append(LC)
		elif os.path.isdir(LC) and os.path.isdir(RC):
			# Fold the subtree's findings into this level's result.
			T = mydircmp(LC,RC)
			for key in ('local_only', 'remote_only', 'local_to_remote',
					'remote_to_local', 'failure_type', 'failure_size'):
				z[key].extend(T[key])
		else:
			# File on one side and directory on the other (or neither, e.g. a broken link).
			z['failure_type'].append(LC)

	return z

Then the script displays the contents of the path sets from mydircmp(). If files would have to be updated in both directions (not copied because they do not exist, but requiring copying over existing data), the script fails before it makes any changes. This situation usually means that I have performed work on both devices without performing a sync, so I would need to manually compare the file trees and make corrections before attempting to use this script. If there are no such error conditions, the script can then perform the file and directory copies using functions from shutil.

# --- Command line: <mode> [[local_dir] remote_dir] ---
# The first argument selects the mode: -c/--compare only reports,
# -f/--filecopy also performs the copies. Anything else prints usage.
argc = len(sys.argv)
if argc < 2:
	printUsageAndExit()
elif sys.argv[1] in ('-c', '--compare'):
	pass
elif sys.argv[1] in ('-f', '--filecopy'):
	flagDoCopy = 1
else:
	printUsageAndExit()

# Remaining arguments name the directories. With two, they are local then
# remote; with one, it is the remote and the local side is the current
# directory; with none, the remote falls back to DEFAULT_REMOTE_DIR.
directoryArgs = sys.argv[2:]
if len(directoryArgs) >= 2:
	LOCDIR, REMDIR = directoryArgs[0], directoryArgs[1]
elif len(directoryArgs) == 1:
	LOCDIR, REMDIR = '.', directoryArgs[0]
else:
	LOCDIR, REMDIR = '.', DEFAULT_REMOTE_DIR

# Allow '~' in either directory.
LOCDIR = os.path.expanduser(LOCDIR)
REMDIR = os.path.expanduser(REMDIR)

Y = mydircmp(LOCDIR,REMDIR)

ltr = len(Y['local_to_remote']) 
rtl = len(Y['remote_to_local']) 
if ltr > 0 and rtl > 0:
	print "osync: ERROR: updates would be BIDIRECTIONAL!"
	print "---------- Update Local to Remote ----------"
	for x in Y['local_to_remote']:
		print "\t", x
	print "---------- Update Remote to Local ----------"
	for x in Y['remote_to_local']:
		print "\t", x
elif len(Y['failure_type']):
	print "osync: ERROR: items have different types:"
	for x in Y['failure_type']:
		print "\t", x
elif len(Y['failure_size']):
	print "osync: ERROR: files have different sizes but same modification times:"
	for x in Y['failure_size']:
		print "\t", x
else:
	if ltr > 0:
		print "---------- Update Local to Remote ----------"
		for x in Y['local_to_remote']:
			print "\t", x, " --> ", REMDIR + x[len(LOCDIR):]
			if flagDoCopy:
				shutil.copy2(x, REMDIR + x[len(LOCDIR):])
	if rtl > 0:
		print "---------- Update Remote to Local ----------"
		for x in Y['remote_to_local']:
			print "\t", x, " --> ", LOCDIR + x[len(REMDIR):]
			if flagDoCopy:
				shutil.copy2(x, LOCDIR + x[len(REMDIR):])
	if len(Y['local_only']) > 0:
		print "---------- Copy   Local to Remote ----------"
		for x in Y['local_only']:
			print "\t", x, " --> ", REMDIR + x[len(LOCDIR):]
			if flagDoCopy:
				if os.path.isdir(x):
					shutil.copytree(x, REMDIR + x[len(LOCDIR):])
				else:
					shutil.copy2(x, REMDIR + x[len(LOCDIR):])
	if len(Y['remote_only']) > 0:
		print "---------- Copy   Remote to Local ----------"
		for x in Y['remote_only']:
			print "\t", x, " --> ", LOCDIR + x[len(REMDIR):]
			if flagDoCopy:
				if os.path.isdir(x):
					shutil.copytree(x, LOCDIR + x[len(REMDIR):])
				else:
					shutil.copy2(x, LOCDIR + x[len(REMDIR):])

There is a problem with performing time comparisons between the OS X file system and the FAT32 system. The timestamps for FAT32 files have a coarse 2-second resolution. OS X has a finer granularity, so it is possible to perform a file copy and have files appear to still require a copy in either direction. To get around that problem, after the script performs its updates, it will perform a recursive scan of the local (OS X) directory tree to copy the FAT32 file times to the local files.

def fixLocalModificationTimes(path0):
	"""Copy the remote files' timestamps onto the matching local files.

	FAT and FAT32 have a 2-second resolution for file modification times, so
	after a copy the local and remote times can still differ. For every file
	under path0 this reads the access and modification times of the file at the
	same relative location under the module-level REMDIR and applies them to
	the local file.

	path0 is the root of the local tree. OSError propagates if a local file has
	no remote counterpart.
	"""
	for root,dirs,files in os.walk(path0):
		# relpath (rather than root[len(path0):]) keeps the mapping correct
		# when path0 was given with a trailing separator.
		remoteRoot = os.path.normpath(os.path.join(REMDIR, os.path.relpath(root, path0)))
		for x in files:
			localPath = os.path.join(root,x)
			remotePath = os.path.join(remoteRoot,x)
			rfi = os.stat(remotePath)
			os.utime(localPath,(rfi.st_atime, rfi.st_mtime))

The other thing I dislike about FAT32 copies is the lack of a real executable permission bit. I do not want all of the local data files to have their executable bits set after the update, but I need various script files to keep their executable bits. I have tried several methods for this. At first, I tried clearing all executable bits and having a dictionary of specific paths for which I wanted the executable bit set. This works well, but I tired of updating the dictionary. Today I wrote code to use subprocess.Popen() to call /usr/bin/file on each path, then use a regex on the output to decide whether a file should be executable or not. This works well, but it is noticeably slow on a file tree with over a thousand files. Then I implemented a hybrid method. For each path with a known extension, I look up the extension in a table and set the executable bit appropriately. Then I only have to use Popen() for the remaining paths (mostly paths without extensions), which does not cause a noticeable delay.

# Maps a lowercased filename extension (including the leading dot) to a flag:
# 1 means fixPermissionBits() marks the file executable (EXEBITS), 0 means it
# gets the plain-file mode. Extensions not listed here are classified by
# running /usr/bin/file on the path instead.
executableBitByExtension = {
	# common Unix executable types
	'.py' : 1, '.sh' : 1, '.rb' : 1, '.cgi' : 1,

	# Mark Windows executables as regular files for OS X.
	'.exe' : 0,

	# Text, data, media
	'.html' : 0, '.css' : 0, '.js' : 0, '.xml' : 0, '.xsl' : 0, '.json' : 0,
	'.txt' : 0, '.csv' : 0, '.rtf' : 0, '.output' : 0,
	'.tar' : 0, '.gz' : 0, '.tgz' : 0, '.zip' : 0, '.dmg' : 0,
	'.jpg' : 0, '.jpeg' : 0, '.png' : 0, '.gif' : 0, '.tif' : 0, '.bmp' : 0,
	'.mp3' : 0, '.mp4' : 0, '.m4a' : 0, '.m4v' : 0, '.wav' : 0, '.avi' : 0, '.mov' : 0,
	'.pdf' : 0, '.doc' : 0, '.docx' : 0, '.xls' : 0, '.xlsx' : 0, '.ppt' : 0,
	'.c' : 0, '.cpp' : 0, '.m' : 0, '.scheme' : 0
}

def fixPermissionBits(path0,overrides):
	"""Reset the permission bits of everything under path0 to sensible values.

	path0 and every directory below it get 0o755. Each file gets either
	EXEBITS or 0o644, decided in this order:
	  1. an explicit mode in `overrides` (a {path: mode} mapping; may be None),
	  2. the file's lowercased extension, looked up in executableBitByExtension,
	  3. a file named '.DS_Store' is never executable,
	  4. otherwise the output of /usr/bin/file is searched for an executable type.
	Directories may also be given an explicit mode through `overrides`.
	"""
	DIRBITS  = 0o755
	FILEBITS = 0o644
	exePattern = re.compile('(script text executable)|(Mach-O executable)')
	pinned = overrides if overrides else {}

	os.chmod(path0,DIRBITS)
	for parent,subdirs,filenames in os.walk(path0):
		for name in subdirs:
			full = os.path.join(parent,name)
			os.chmod(full,pinned.get(full,DIRBITS))

		for name in filenames:
			full = os.path.join(parent,name)
			ext = os.path.splitext(name)[1].lower()
			if full in pinned:
				mode = pinned[full]
			elif ext in executableBitByExtension:
				mode = EXEBITS if executableBitByExtension[ext] else FILEBITS
			elif name == '.DS_Store':
				# os.path.splitext() does not treat a leading dot as an extension,
				# so this name never matches the extension table.
				# Could we assume that dot-files are never executable?
				mode = FILEBITS
			else:
				# Slow path: ask /usr/bin/file what kind of file this is.
				proc = subprocess.Popen(['/usr/bin/file', full],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
				output, errors = proc.communicate()
				mode = EXEBITS if exePattern.search(output) else FILEBITS
			os.chmod(full,mode)

So I save this code to a script called async. Every time I mount the remote drive, I cd to the local directory and I run ./async -c to compare the directory trees. If I see an error I fix it. If I see bidirectional copies because I renamed a file on one side, then I rename those files on the other side and run ./async -c again. If I see bidirectional updates, then I know I should compare the file contents, perhaps with /usr/bin/diff. Usually, though, there are no issues and I can immediately run ./async -f to perform the sync.