ports//sysutils/boxbackup/work/boxbackup-0.10/lib/backupclient/BackupStoreFileDiff.cpp

// distribution boxbackup-0.10 (svn version: 494)
//  
// Copyright (c) 2003 - 2006
//      Ben Summers and contributors.  All rights reserved.
//  
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. All use of this software and associated advertising materials must 
//    display the following acknowledgment:
//        This product includes software developed by Ben Summers.
// 4. The names of the Authors may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
// 
// [Where legally impermissible the Authors do not disclaim liability for 
// direct physical injury or death caused solely by defects in the software 
// unless it is modified by a third party.]
// 
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//  
//  
//  
// --------------------------------------------------------------------------
//
// File
//		Name:    BackupStoreFileDiff.cpp
//		Purpose: Functions relating to diffing BackupStoreFiles
//		Created: 12/1/04
//
// --------------------------------------------------------------------------

#include "Box.h"

#include <new>
#include <map>

#ifdef HAVE_TIME_H
	#include <time.h>
#elif HAVE_SYS_TIME_H
	#include <sys/time.h>
#endif

#include "BackupStoreFile.h"
#include "BackupStoreFileWire.h"
#include "BackupStoreFileCryptVar.h"
#include "BackupStoreObjectMagic.h"
#include "BackupStoreException.h"
#include "BackupStoreFileEncodeStream.h"
#include "BackupStoreConstants.h"
#include "FileStream.h"
#include "RollingChecksum.h"
#include "MD5Digest.h"
#include "CommonException.h"

#include "MemLeakFindOn.h"

using namespace BackupStoreFileCryptVar;
using namespace BackupStoreFileCreation;

// By default, don't trace out details of the diff as we go along -- would fill up logs significantly.
// But it's useful for the test.
#ifndef NDEBUG
	bool BackupStoreFile::TraceDetailsOfDiffProcess = false;
#endif

static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis);
static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
static void SearchForMatchingBlocks(IOStream &rFile, 
	std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex, 
	int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES],
	DiffTimer *pDiffTimer);
static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);

// sDiffTimerExpired flags when the diff timer has expired. When true, the 
// diff routine should check the wall clock as soon as possible, to determine 
// whether it's time for a keepalive to be sent, or whether the diff has been 
// running for too long and should be terminated.
static bool sDiffTimerExpired = false;


// --------------------------------------------------------------------------
//
// Function
//		Name:    BackupStoreFile::DiffTimerExpired()
//		Purpose: Notifies BackupStoreFile object that the diff operation
//				 timer has expired, which may mean that a keepalive should
//				 be sent, or the diff should be terminated. Called from an
//				 external timer, so it should not do more than set a flag.
//
//		Created: 19/1/06
//
// --------------------------------------------------------------------------
void BackupStoreFile::DiffTimerExpired()
{
	sDiffTimerExpired = true;
}


// --------------------------------------------------------------------------
//
// Function
//		Name:    BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &)
//		Purpose: Move the file pointer in this stream to just before the block index.
//				 Assumes that the stream is at the beginning, seekable, and
//				 reading from the stream is OK.
//		Created: 12/1/04
//
// --------------------------------------------------------------------------
void BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &rStream)
{
	// Size of file
	int64_t fileSize = rStream.BytesLeftToRead();

	// Get header
	file_StreamFormat hdr;

	// Read the header
	if(!rStream.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, IOStream::TimeOutInfinite))
	{
		// Couldn't read header
		THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
	}

	// Check magic number
	if(ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1
#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE
		&& ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V0
#endif
		)
	{
		THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
	}
	
	// Work out where the index is
	int64_t numBlocks = box_ntoh64(hdr.mNumBlocks);
	int64_t blockHeaderPosFromEnd = ((numBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader));
	
	// Sanity check
	if(blockHeaderPosFromEnd > static_cast<int64_t>(fileSize - sizeof(file_StreamFormat)))
	{
		THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
	}
	
	// Seek to that position
	rStream.Seek(0 - blockHeaderPosFromEnd, IOStream::SeekType_End);
	
	// Done. Stream now in right position (as long as the file is formatted correctly)
}


// --------------------------------------------------------------------------
//
// Function
//		Name:    BackupStoreFile::EncodeFileDiff(const char *, int64_t, const BackupStoreFilename &, int64_t, IOStream &, int64_t *)
//		Purpose: Similar to EncodeFile, but takes the object ID of the file it's
//				 diffing from, and the index of the blocks in a stream. It'll then
//				 calculate which blocks can be reused from that old file.
//				 The timeout is the timeout value for reading the diff block index.
//				 If pIsCompletelyDifferent != 0, it will be set to true if the
//				 the two files are completely different (do not share any block), false otherwise.
//				 
//		Created: 12/1/04
//
// --------------------------------------------------------------------------
std::auto_ptr<IOStream> BackupStoreFile::EncodeFileDiff
(
	const char *Filename, int64_t ContainerID,
	const BackupStoreFilename &rStoreFilename, int64_t DiffFromObjectID, 
	IOStream &rDiffFromBlockIndex, int Timeout, DiffTimer *pDiffTimer, 
	int64_t *pModificationTime, bool *pIsCompletelyDifferent)
{
	// Is it a symlink?
	{
		struct stat st;
		if(::lstat(Filename, &st) != 0)
		{
			THROW_EXCEPTION(CommonException, OSFileError)
		}
		if((st.st_mode & S_IFLNK) == S_IFLNK)
		{
			// Don't do diffs for symlinks
			if(pIsCompletelyDifferent != 0)
			{
				*pIsCompletelyDifferent = true;
			}
			return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime);
		}
	}

	// Load in the blocks
	BlocksAvailableEntry *pindex = 0;
	int64_t blocksInIndex = 0;
	bool canDiffFromThis = false;
	LoadIndex(rDiffFromBlockIndex, DiffFromObjectID, &pindex, blocksInIndex, Timeout, canDiffFromThis);
	//TRACE1("Diff: Blocks in index: %lld\n", blocksInIndex);
	
	if(!canDiffFromThis)
	{
		// Don't do diffing...
		if(pIsCompletelyDifferent != 0)
		{
			*pIsCompletelyDifferent = true;
		}
		return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime);
	}
	
	// Pointer to recipe we're going to create
	BackupStoreFileEncodeStream::Recipe *precipe = 0;
	
	try
	{
		// Find which sizes should be scanned
		int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
		FindMostUsedSizes(pindex, blocksInIndex, sizesToScan);
		
		// Flag for reporting to the user
		bool completelyDifferent;
			
		// BLOCK
		{
			// Search the file to find matching blocks
			std::map<int64_t, int64_t> foundBlocks; // map of offset in file to index in block index
			int64_t sizeOfInputFile = 0;
			// BLOCK
			{
				FileStream file(Filename);
				// Get size of file
				sizeOfInputFile = file.BytesLeftToRead();
				// Find all those lovely matching blocks
				SearchForMatchingBlocks(file, foundBlocks, pindex, 
					blocksInIndex, sizesToScan, pDiffTimer);
				
				// Is it completely different?
				completelyDifferent = (foundBlocks.size() == 0);
			}
			
			// Create a recipe -- if the two files are completely different, don't put the from file ID in the recipe.
			precipe = new BackupStoreFileEncodeStream::Recipe(pindex, blocksInIndex, completelyDifferent?(0):(DiffFromObjectID));
			BlocksAvailableEntry *pindexKeptRef = pindex;	// we need this later, but must set pindex == 0 now, because of exceptions
			pindex = 0;		// Recipe now has ownership
			
			// Fill it in
			GenerateRecipe(*precipe, pindexKeptRef, blocksInIndex, foundBlocks, sizeOfInputFile);
		}
		// foundBlocks no longer required
		
		// Create the stream
		std::auto_ptr<IOStream> stream(new BackupStoreFileEncodeStream);
	
		// Do the initial setup
		((BackupStoreFileEncodeStream*)stream.get())->Setup(Filename, precipe, ContainerID, rStoreFilename, pModificationTime);
		precipe = 0;	// Stream has taken ownership of this
		
		// Tell user about completely different status?
		if(pIsCompletelyDifferent != 0)
		{
			*pIsCompletelyDifferent = completelyDifferent;
		}
	
		// Return the stream for the caller
		return stream;
	}
	catch(...)
	{
		// cleanup
		if(pindex != 0)
		{
			::free(pindex);
			pindex = 0;
		}
		if(precipe != 0)
		{
			delete precipe;
			precipe = 0;
		}
		throw;
	}
}

// --------------------------------------------------------------------------
//
// Function
//		Name:    static LoadIndex(IOStream &, int64_t, BlocksAvailableEntry **, int64_t, bool &)
//		Purpose: Read in an index, and decrypt, and store in the in memory block format.
//				 rCanDiffFromThis is set to false if the version of the from file is too old.
//		Created: 12/1/04
//
// --------------------------------------------------------------------------
static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis)
{
	// Reset
	rNumBlocksOut = 0;
	rCanDiffFromThis = false;
	
	// Read header
	file_BlockIndexHeader hdr;
	if(!rBlockIndex.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, Timeout))
	{
		// Couldn't read header
		THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
	}

#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE
	// Check against backwards comptaibility stuff
	if(hdr.mMagicValue == (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V0))
	{
		// Won't diff against old version
		
		// Absorb rest of stream
		char buffer[2048];
		while(rBlockIndex.StreamDataLeft())
		{
			rBlockIndex.Read(buffer, sizeof(buffer), 1000 /* 1 sec timeout */);
		}
		
		// Tell caller
		rCanDiffFromThis = false;
		return;
	}
#endif

	// Check magic
	if(hdr.mMagicValue != (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1))
	{
		THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
	}
	
	// Check that we're not trying to diff against a file which references blocks from another file
	if(((int64_t)box_ntoh64(hdr.mOtherFileID)) != 0)
	{
		THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile)
	}

	// Mark as an acceptable diff.
	rCanDiffFromThis = true;

	// Get basic information
	int64_t numBlocks = box_ntoh64(hdr.mNumBlocks);
	uint64_t entryIVBase = box_ntoh64(hdr.mEntryIVBase);
	
	//TODO: Verify that these sizes look reasonable
	
	// Allocate space for the index
	BlocksAvailableEntry *pindex = (BlocksAvailableEntry*)::malloc(sizeof(BlocksAvailableEntry) * numBlocks);
	if(pindex == 0)
	{
		throw std::bad_alloc();
	}
	
	try
	{	
		for(int64_t b = 0; b < numBlocks; ++b)
		{
			// Read an entry from the stream
			file_BlockIndexEntry entry;
			if(!rBlockIndex.ReadFullBuffer(&entry, sizeof(entry), 0 /* not interested in bytes read if this fails */, Timeout))
			{
				// Couldn't read entry
				THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
			}	
		
			// Calculate IV for this entry
			uint64_t iv = entryIVBase;
			iv += b;
			// Network byte order
			iv = box_hton64(iv);
			sBlowfishDecryptBlockEntry.SetIV(&iv);			
			
			// Decrypt the encrypted section
			file_BlockIndexEntryEnc entryEnc;
			int sectionSize = sBlowfishDecryptBlockEntry.TransformBlock(&entryEnc, sizeof(entryEnc),
					entry.mEnEnc, sizeof(entry.mEnEnc));
			if(sectionSize != sizeof(entryEnc))
			{
				THROW_EXCEPTION(BackupStoreException, BlockEntryEncodingDidntGiveExpectedLength)
			}

			// Check that we're not trying to diff against a file which references blocks from another file
			if(((int64_t)box_ntoh64(entry.mEncodedSize)) <= 0)
			{
				THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile)
			}
			
			// Store all the required information
			pindex[b].mpNextInHashList = 0;	// hash list not set up yet
			pindex[b].mSize = ntohl(entryEnc.mSize);
			pindex[b].mWeakChecksum = ntohl(entryEnc.mWeakChecksum);
			::memcpy(pindex[b].mStrongChecksum, entryEnc.mStrongChecksum, sizeof(pindex[b].mStrongChecksum));
		}
		
		// Store index pointer for called
		ASSERT(ppIndex != 0);
		*ppIndex = pindex;

		// Store number of blocks for caller
		rNumBlocksOut = numBlocks;	

	}
	catch(...)
	{
		// clean up and send the exception along its way
		::free(pindex);
		throw;
	}
}


// --------------------------------------------------------------------------
//
// Function
//		Name:    static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
//		Purpose: Finds the most commonly used block sizes in the index
//		Created: 12/1/04
//
// --------------------------------------------------------------------------
static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
{
	// Array for lengths
	int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];

	// Set arrays to lots of zeros (= unused entries)
	for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l)
	{
		Sizes[l] = 0;
		sizeCounts[l] = 0;
	}

	// Array for collecting sizes
	std::map<int32_t, int64_t> foundSizes;
	
	// Run through blocks and make a count of the entries
	for(int64_t b = 0; b < NumBlocks; ++b)
	{
		// Only if the block size is bigger than the minimum size we'll scan for
		if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)
		{
			// Find entry?
			std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize));
			if(f != foundSizes.end())
			{
				// Increment existing entry
				foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1;
			}
			else
			{
				// New entry
				foundSizes[pIndex[b].mSize] = 1;
			}
		}
	}
	
	// Make the block sizes
	for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i)
	{
		// Find the position of the size in the array
		for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
		{
			// Instead of sorting on the raw count of blocks,
			// take the file area covered by this block size.
			if(i->second * i->first > sizeCounts[t] * Sizes[t])
			{
				// Then this size belong before this entry -- shuffle them up
				for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
				{
					Sizes[s] = Sizes[s-1];
					sizeCounts[s] = sizeCounts[s-1];
				}
				
				// Insert this size
				Sizes[t] = i->first;
				sizeCounts[t] = i->second;
				
				// Shouldn't do any more searching
				break;
			}
		}
	}
	
	// trace the size table in debug builds
#ifndef NDEBUG
	if(BackupStoreFile::TraceDetailsOfDiffProcess)
	{
		for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
		{
			TRACE3("Diff block size %d: %d (count = %lld)\n", t, Sizes[t], sizeCounts[t]);
		}
	}
#endif
}



// --------------------------------------------------------------------------
//
// Function
//		Name:    static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
//		Purpose: Find the matching blocks within the file.
//		Created: 12/1/04
//
// --------------------------------------------------------------------------
static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
	BlocksAvailableEntry *pIndex, int64_t NumBlocks, 
	int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer)
{
	time_t TimeMgmtEpoch   = 0;
	int MaximumDiffingTime = 0;
	int KeepAliveTime      = 0;

	if (pDiffTimer)
	{
		TimeMgmtEpoch      = pDiffTimer->GetTimeMgmtEpoch();
		MaximumDiffingTime = pDiffTimer->GetMaximumDiffingTime();
		KeepAliveTime      = pDiffTimer->GetKeepaliveTime();
	}
	
	std::map<int64_t, int32_t> goodnessOfFit;

	// Allocate the hash lookup table
	BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));

	// Choose a size for the buffer, just a little bit more than the maximum block size
	int32_t bufSize = Sizes[0];
	for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
	{
		if(Sizes[z] > bufSize) bufSize = Sizes[z];
	}
	bufSize += 4;
	ASSERT(bufSize > Sizes[0]);
	ASSERT(bufSize > 0);
	if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024))
	{
		THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
	}
	
	// TODO: Use buffered file class
	// Because we read in the file a scanned block size at a time, it is likely to be
	// inefficient. Probably will be much better to use a buffering IOStream class which
	// reads data in at the size of the filesystem block size.

	// Allocate the buffers.
	uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize);
	uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize);
	try
	{
		// Check buffer allocation
		if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
		{
			// If a buffer got allocated, it will be cleaned up in the catch block
			throw std::bad_alloc();
		}
		
		// Flag to abort the run, if too many blocks are found -- avoid using
		// huge amounts of processor time when files contain many similar blocks.
		bool abortSearch = false;
		
		// Search for each block size in turn
		// NOTE: Do the smallest size first, so that the scheme for adding
		// entries in the found list works as expected and replaces smallers block
		// with larger blocks when it finds matches at the same offset in the file.
		for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s)
		{
			ASSERT(Sizes[s] <= bufSize);
			//TRACE2("Diff pass %d, for block size %d\n", s, Sizes[s]);
			
			// Check we haven't finished
			if(Sizes[s] == 0)
			{
				// empty entry, try next size
				continue;
			}
			
			// Set up the hash table entries
			SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable);
		
			// Shift file position to beginning
			rFile.Seek(0, IOStream::SeekType_Absolute);
			
			// Read first block
			if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s])
			{
				// Size of file too short to match -- do next size
				continue;
			}
			
			// Setup block pointers
			uint8_t *beginnings = pbuffer0;
			uint8_t *endings = pbuffer1;
			int offset = 0;
			
			// Calculate the first checksum, ready for rolling
			RollingChecksum rolling(beginnings, Sizes[s]);
			
			// Then roll, until the file is exhausted
			int64_t fileBlockNumber = 0;
			int64_t fileOffset = 0;
			int rollOverInitialBytes = 0;
			while(true)
			{
				if(sDiffTimerExpired)
				{
					ASSERT(TimeMgmtEpoch > 0);
					ASSERT(pDiffTimer != NULL);
					
					time_t tTotalRunIntvl = time(NULL) - TimeMgmtEpoch;
					
					if(MaximumDiffingTime > 0 && 
						tTotalRunIntvl >= MaximumDiffingTime)
					{
						TRACE0("MaximumDiffingTime reached - "
							"suspending file diff\n");
						abortSearch = true;
						break;
					}
					else if(KeepAliveTime > 0)
					{
						TRACE0("KeepAliveTime reached - "
							"initiating keep-alive\n");
						pDiffTimer->DoKeepAlive();
					}

					sDiffTimerExpired = false;
				}

				// Load in another block of data, and record how big it is
				int bytesInEndings = rFile.Read(endings, Sizes[s]);
				int tmp;

				// Skip any bytes from a previous matched block
				if(rollOverInitialBytes > 0 && offset < bytesInEndings)
				{
					int spaceLeft = bytesInEndings - offset;
					int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;

					rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);

					offset += thisRoll;
					fileOffset += thisRoll;
					rollOverInitialBytes -= thisRoll;

					if(rollOverInitialBytes)
					{
						goto refresh;
					}
				}

				if(goodnessOfFit.count(fileOffset))
				{
					tmp = goodnessOfFit[fileOffset];
				}
				else
				{
					tmp = 0;
				}

				if(tmp >= Sizes[s])
				{
					// Skip over bigger ready-matched blocks completely
					rollOverInitialBytes = tmp;
					int spaceLeft = bytesInEndings - offset;
					int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;

					rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);

					offset += thisRoll;
					fileOffset += thisRoll;
					rollOverInitialBytes -= thisRoll;

					if(rollOverInitialBytes)
					{
						goto refresh;
					}
				}

				while(offset < bytesInEndings)
				{
					// Is current checksum in hash list?
					uint16_t hash = rolling.GetComponentForHashing();
					if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
					{
						if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
						{
							goodnessOfFit[fileOffset] = Sizes[s];

							// Block matched, roll the checksum forward to the next block without doing
							// any more comparisons, because these are pointless (as any more matches will be ignored when
							// the receipe is generated) and just take up valuable processor time. Edge cases are
							// especially nasty, using huge amounts of time and memory.
							int skip = Sizes[s];
							if(offset < bytesInEndings && skip > 0)
							{
								int spaceLeft = bytesInEndings - offset;
								int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;

								rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);

								offset += thisRoll;
								fileOffset += thisRoll;
								skip -= thisRoll;
							}
							// Not all the bytes necessary will have been skipped, so get them
							// skipped after the next block is loaded.
							rollOverInitialBytes = skip;
							
							// End this loop, so the final byte isn't used again
							break;
						}

						int64_t NumBlocksFound = static_cast<int64_t>(
							rFoundBlocks.size());
						int64_t MaxBlocksFound = NumBlocks * 
							BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
						
						if(NumBlocksFound > MaxBlocksFound)
						{
							abortSearch = true;
							break;
						}
					}
					
					// Roll checksum forward
					rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
				
					// Increment offsets
					++offset;
					++fileOffset;
				}
				
				if(abortSearch) break;
				
			refresh:
				// Finished?
				if(bytesInEndings != Sizes[s])
				{
					// No more data in file -- check the final block
					// (Do a copy and paste of 5 lines of code instead of introducing a comparison for
					// each byte of the file)
					uint16_t hash = rolling.GetComponentForHashing();
					if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
					{
						if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
						{
							goodnessOfFit[fileOffset] = Sizes[s];
						}
					}

					// finish
					break;
				}
				
				// Switch buffers, reset offset
				beginnings = endings;
				endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0);	// ie the other buffer
				offset = 0;

				// And count the blocks which have been done
				++fileBlockNumber;
			}

			if(abortSearch) break;
		}
		
		// Free buffers and hash table
		::free(pbuffer1);
		pbuffer1 = 0;
		::free(pbuffer0);
		pbuffer0 = 0;
		::free(phashTable);
		phashTable = 0;
	}
	catch(...)
	{
		// Cleanup and throw
		if(pbuffer1 != 0) ::free(pbuffer1);
		if(pbuffer0 != 0) ::free(pbuffer0);
		if(phashTable != 0) ::free(phashTable);
		throw;
	}
	
#ifndef NDEBUG
	if(BackupStoreFile::TraceDetailsOfDiffProcess)
	{
		// Trace out the found blocks in debug mode
		TRACE0("Diff: list of found blocks\n======== ======== ======== ========\n  Offset   BlkIdx     Size Movement\n");
		for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
		{
			int64_t orgLoc = 0;
			for(int64_t b = 0; b < i->second; ++b)
			{
				orgLoc += pIndex[b].mSize;
			}
			TRACE4("%8lld %8lld %8lld %8lld\n", i->first, i->second,
				(int64_t)(pIndex[i->second].mSize), i->first - orgLoc);
		}
		TRACE0("======== ======== ======== ========\n");
	}
#endif
}


// --------------------------------------------------------------------------
//
// Function
//		Name:    static SetupHashTable(BlocksAvailableEntry *, int64_t, in32_t, BlocksAvailableEntry **)
//		Purpose: Set up the hash table ready for a scan
//		Created: 14/1/04
//
// --------------------------------------------------------------------------
static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable)
{
	// Set all entries in the hash table to zero
	::memset(pHashTable, 0, (sizeof(BlocksAvailableEntry *) * (64*1024)));

	// Scan through the blocks, building the hash table
	for(int64_t b = 0; b < NumBlocks; ++b)
	{
		// Only look at the required block size
		if(pIndex[b].mSize == BlockSize)
		{
			// Get the value under which to hash this entry
			uint16_t hash = RollingChecksum::ExtractHashingComponent(pIndex[b].mWeakChecksum);
			
			// Already present in table?
			if(pHashTable[hash] != 0)
			{
				//TRACE1("Another hash entry for %d found\n", hash);
				// Yes -- need to set the pointer in this entry to the current entry to build the linked list
				pIndex[b].mpNextInHashList = pHashTable[hash];
			}

			// Put a pointer to this entry in the hash table
			pHashTable[hash] = pIndex + b;
		}
	}
}


// --------------------------------------------------------------------------
//
// Function
//		Name:    static bool SecondStageMatch(xxx)
//		Purpose: When a match in the hash table is found, scan for second stage match using strong checksum.
//		Created: 14/1/04
//
// --------------------------------------------------------------------------
static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
	int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
{
	// Check parameters
	ASSERT(pBeginnings != 0);
	ASSERT(pEndings != 0);
	ASSERT(Offset >= 0);
	ASSERT(BlockSize > 0);
	ASSERT(pFirstInHashList != 0);
	ASSERT(pIndex != 0);

#ifndef NDEBUG
	uint16_t DEBUG_Hash = fastSum.GetComponentForHashing();
#endif
	uint32_t Checksum = fastSum.GetChecksum();

	// Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
	BlocksAvailableEntry *scan = pFirstInHashList;
	bool found=false;
	while(scan != 0)
	{
		if(scan->mWeakChecksum == Checksum)
		{
			found = true;
			break;
		}
		scan = scan->mpNextInHashList;
	}
	if(!found)
	{
		return false;
	}

	// Calculate the strong MD5 digest for this block
	MD5Digest strong;
	// Add the data from the beginnings
	strong.Add(pBeginnings + Offset, BlockSize - Offset);
	// Add any data from the endings
	if(Offset > 0)
	{
		strong.Add(pEndings, Offset);
	}
	strong.Finish();
	
	// Then go through the entries in the hash list, comparing with the strong digest calculated
	scan = pFirstInHashList;
	//TRACE0("second stage match\n");
	while(scan != 0)
	{
		//TRACE3("scan size %d, block size %d, hash %d\n", scan->mSize, BlockSize, Hash);
		ASSERT(scan->mSize == BlockSize);
		ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash);
	
		// Compare?
		if(strong.DigestMatches(scan->mStrongChecksum))
		{
			//TRACE0("Match!\n");
			// Found! Add to list of found blocks...
			int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
			int64_t blockIndex = (scan - pIndex);	// pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
			
			// We do NOT search for smallest blocks first, as this code originally assumed.
			// To prevent this from potentially overwriting a better match, the caller must determine
			// the relative "goodness" of any existing match and this one, and avoid the call if it
			// could be detrimental.
			rFoundBlocks[fileOffset] = blockIndex;
			
			// No point in searching further, report success
			return true;
		}
	
		// Next
		scan = scan->mpNextInHashList;
	}
	
	// Not matched
	return false;
}


// --------------------------------------------------------------------------
//
// Function
//		Name:    static GenerateRecipe(BackupStoreFileEncodeStream::Recipe &, BlocksAvailableEntry *, int64_t, std::map<int64_t, int64_t> &)
//		Purpose: Fills in the recipe from the found block list
//		Created: 15/1/04
//
// --------------------------------------------------------------------------
static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex,
		int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile)
{
	// NOTE: This function could be a lot more sophisiticated. For example, if
	// a small block overlaps a big block like this
	//   ****
	//      *******************************
	// then the small block will be used, not the big one. But it'd be better to
	// just ignore the small block and keep the big one. However, some stats should
	// be gathered about real world files before writing complex code which might
	// go wrong.

	// Initialise a blank instruction
	BackupStoreFileEncodeStream::RecipeInstruction instruction;
	#define RESET_INSTRUCTION			\
	instruction.mSpaceBefore = 0;		\
	instruction.mBlocks = 0;			\
	instruction.mpStartBlock = 0;
	RESET_INSTRUCTION

	// First, a special case for when there are no found blocks
	if(rFoundBlocks.size() == 0)
	{
		// No blocks, just a load of space
		instruction.mSpaceBefore = SizeOfInputFile;
		rRecipe.push_back(instruction);	
	
		#ifndef NDEBUG
		if(BackupStoreFile::TraceDetailsOfDiffProcess)
		{
			TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile);
		}
		#endif
		
		// Don't do anything
		return;
	}

	// Current location
	int64_t loc = 0;

	// Then iterate through the list, generating the recipe
	std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin());
	ASSERT(i != rFoundBlocks.end());	// check logic

	// Counting for debug tracing
#ifndef NDEBUG
	int64_t debug_NewBytesFound = 0;
	int64_t debug_OldBlocksUsed = 0;
#endif
	
	for(; i != rFoundBlocks.end(); ++i)
	{
		// Remember... map is (position in file) -> (index of block in pIndex)
		
		if(i->first < loc)
		{
			// This block overlaps the last one
			continue;
		}
		else if(i->first > loc)
		{
			// There's a gap between the end of the last thing and this block.
			// If there's an instruction waiting, push it onto the list
			if(instruction.mSpaceBefore != 0 || instruction.mpStartBlock != 0)
			{
				rRecipe.push_back(instruction);
			}
			// Start a new instruction, with the gap ready
			RESET_INSTRUCTION
			instruction.mSpaceBefore = i->first - loc;
			// Move location forward to match
			loc += instruction.mSpaceBefore;
#ifndef NDEBUG
			debug_NewBytesFound += instruction.mSpaceBefore;
#endif
		}
		
		// First, does the current instruction need pushing back, because this block is not
		// sequential to the last one?
		if(instruction.mpStartBlock != 0 && (pIndex + i->second) != (instruction.mpStartBlock + instruction.mBlocks))
		{
			rRecipe.push_back(instruction);
			RESET_INSTRUCTION
		}
		
		// Add in this block
		if(instruction.mpStartBlock == 0)
		{
			// This block starts a new instruction
			instruction.mpStartBlock = pIndex + i->second;
			instruction.mBlocks = 1;
		}
		else
		{
			// It continues the previous section of blocks
			instruction.mBlocks += 1;
		}

#ifndef NDEBUG
		debug_OldBlocksUsed++;
#endif

		// Move location forward
		loc += pIndex[i->second].mSize;
	}
	
	// Push the last instruction generated
	rRecipe.push_back(instruction);
	
	// Is there any space left at the end which needs sending?
	if(loc != SizeOfInputFile)
	{
		RESET_INSTRUCTION
		instruction.mSpaceBefore = SizeOfInputFile - loc;
#ifndef NDEBUG
		debug_NewBytesFound += instruction.mSpaceBefore;
#endif
		rRecipe.push_back(instruction);		
	}
	
	// dump out the recipe
#ifndef NDEBUG
	TRACE2("Diff: %lld new bytes found, %lld old blocks used\n", debug_NewBytesFound, debug_OldBlocksUsed);
	if(BackupStoreFile::TraceDetailsOfDiffProcess)
	{
		TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk  NumBlks\n", rRecipe.size());
		{
			for(unsigned int e = 0; e < rRecipe.size(); ++e)
			{
				char b[64];
#ifdef WIN32
				sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
#else
				sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
#endif
				TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?"       -":b, (int64_t)rRecipe[e].mBlocks);
			}
		}
		TRACE0("======== ========= ========\n");
	}
#endif
}
syntax highlighted by Code2HTML, v. 0.9.1