// distribution boxbackup-0.10 (svn version: 494)
//
// Copyright (c) 2003 - 2006
// Ben Summers and contributors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// 3. All use of this software and associated advertising materials must
// display the following acknowledgment:
// This product includes software developed by Ben Summers.
// 4. The names of the Authors may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// [Where legally impermissible the Authors do not disclaim liability for
// direct physical injury or death caused solely by defects in the software
// unless it is modified by a third party.]
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//
//
// --------------------------------------------------------------------------
//
// File
// Name: BackupStoreFileDiff.cpp
// Purpose: Functions relating to diffing BackupStoreFiles
// Created: 12/1/04
//
// --------------------------------------------------------------------------
#include "Box.h"
#include <new>
#include <map>
#ifdef HAVE_TIME_H
#include <time.h>
#elif HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include "BackupStoreFile.h"
#include "BackupStoreFileWire.h"
#include "BackupStoreFileCryptVar.h"
#include "BackupStoreObjectMagic.h"
#include "BackupStoreException.h"
#include "BackupStoreFileEncodeStream.h"
#include "BackupStoreConstants.h"
#include "FileStream.h"
#include "RollingChecksum.h"
#include "MD5Digest.h"
#include "CommonException.h"
#include "MemLeakFindOn.h"
using namespace BackupStoreFileCryptVar;
using namespace BackupStoreFileCreation;
// By default, don't trace out details of the diff as we go along -- would fill up logs significantly.
// But it's useful for the test.
#ifndef NDEBUG
bool BackupStoreFile::TraceDetailsOfDiffProcess = false;
#endif
static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis);
static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES]);
static void SearchForMatchingBlocks(IOStream &rFile,
std::map<int64_t, int64_t> &rFoundBlocks, BlocksAvailableEntry *pIndex,
int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES],
DiffTimer *pDiffTimer);
static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable);
static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings, int Offset, int32_t BlockSize, int64_t FileBlockNumber,
BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks);
static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex, int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile);
// sDiffTimerExpired flags when the diff timer has expired. When true, the
// diff routine should check the wall clock as soon as possible, to determine
// whether it's time for a keepalive to be sent, or whether the diff has been
// running for too long and should be terminated.
static bool sDiffTimerExpired = false;
// --------------------------------------------------------------------------
//
// Function
// Name: BackupStoreFile::DiffTimerExpired()
// Purpose: Notifies BackupStoreFile object that the diff operation
// timer has expired, which may mean that a keepalive should
// be sent, or the diff should be terminated. Called from an
// external timer, so it should not do more than set a flag.
//
// Created: 19/1/06
//
// --------------------------------------------------------------------------
void BackupStoreFile::DiffTimerExpired()
{
sDiffTimerExpired = true;
}
// --------------------------------------------------------------------------
//
// Function
// Name: BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &)
// Purpose: Move the file pointer in this stream to just before the block index.
// Assumes that the stream is at the beginning, seekable, and
// reading from the stream is OK.
// Created: 12/1/04
//
// --------------------------------------------------------------------------
void BackupStoreFile::MoveStreamPositionToBlockIndex(IOStream &rStream)
{
// Size of file
int64_t fileSize = rStream.BytesLeftToRead();
// Get header
file_StreamFormat hdr;
// Read the header
if(!rStream.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, IOStream::TimeOutInfinite))
{
// Couldn't read header
THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
}
// Check magic number
if(ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V1
#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE
&& ntohl(hdr.mMagicValue) != OBJECTMAGIC_FILE_MAGIC_VALUE_V0
#endif
)
{
THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
}
// Work out where the index is
int64_t numBlocks = box_ntoh64(hdr.mNumBlocks);
int64_t blockHeaderPosFromEnd = ((numBlocks * sizeof(file_BlockIndexEntry)) + sizeof(file_BlockIndexHeader));
// Sanity check
if(blockHeaderPosFromEnd > static_cast<int64_t>(fileSize - sizeof(file_StreamFormat)))
{
THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
}
// Seek to that position
rStream.Seek(0 - blockHeaderPosFromEnd, IOStream::SeekType_End);
// Done. Stream now in right position (as long as the file is formatted correctly)
}
// --------------------------------------------------------------------------
//
// Function
// Name: BackupStoreFile::EncodeFileDiff(const char *, int64_t, const BackupStoreFilename &, int64_t, IOStream &, int64_t *)
// Purpose: Similar to EncodeFile, but takes the object ID of the file it's
// diffing from, and the index of the blocks in a stream. It'll then
// calculate which blocks can be reused from that old file.
// The timeout is the timeout value for reading the diff block index.
// If pIsCompletelyDifferent != 0, it will be set to true if the
// the two files are completely different (do not share any block), false otherwise.
//
// Created: 12/1/04
//
// --------------------------------------------------------------------------
std::auto_ptr<IOStream> BackupStoreFile::EncodeFileDiff
(
const char *Filename, int64_t ContainerID,
const BackupStoreFilename &rStoreFilename, int64_t DiffFromObjectID,
IOStream &rDiffFromBlockIndex, int Timeout, DiffTimer *pDiffTimer,
int64_t *pModificationTime, bool *pIsCompletelyDifferent)
{
// Is it a symlink?
{
struct stat st;
if(::lstat(Filename, &st) != 0)
{
THROW_EXCEPTION(CommonException, OSFileError)
}
if((st.st_mode & S_IFLNK) == S_IFLNK)
{
// Don't do diffs for symlinks
if(pIsCompletelyDifferent != 0)
{
*pIsCompletelyDifferent = true;
}
return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime);
}
}
// Load in the blocks
BlocksAvailableEntry *pindex = 0;
int64_t blocksInIndex = 0;
bool canDiffFromThis = false;
LoadIndex(rDiffFromBlockIndex, DiffFromObjectID, &pindex, blocksInIndex, Timeout, canDiffFromThis);
//TRACE1("Diff: Blocks in index: %lld\n", blocksInIndex);
if(!canDiffFromThis)
{
// Don't do diffing...
if(pIsCompletelyDifferent != 0)
{
*pIsCompletelyDifferent = true;
}
return EncodeFile(Filename, ContainerID, rStoreFilename, pModificationTime);
}
// Pointer to recipe we're going to create
BackupStoreFileEncodeStream::Recipe *precipe = 0;
try
{
// Find which sizes should be scanned
int32_t sizesToScan[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
FindMostUsedSizes(pindex, blocksInIndex, sizesToScan);
// Flag for reporting to the user
bool completelyDifferent;
// BLOCK
{
// Search the file to find matching blocks
std::map<int64_t, int64_t> foundBlocks; // map of offset in file to index in block index
int64_t sizeOfInputFile = 0;
// BLOCK
{
FileStream file(Filename);
// Get size of file
sizeOfInputFile = file.BytesLeftToRead();
// Find all those lovely matching blocks
SearchForMatchingBlocks(file, foundBlocks, pindex,
blocksInIndex, sizesToScan, pDiffTimer);
// Is it completely different?
completelyDifferent = (foundBlocks.size() == 0);
}
// Create a recipe -- if the two files are completely different, don't put the from file ID in the recipe.
precipe = new BackupStoreFileEncodeStream::Recipe(pindex, blocksInIndex, completelyDifferent?(0):(DiffFromObjectID));
BlocksAvailableEntry *pindexKeptRef = pindex; // we need this later, but must set pindex == 0 now, because of exceptions
pindex = 0; // Recipe now has ownership
// Fill it in
GenerateRecipe(*precipe, pindexKeptRef, blocksInIndex, foundBlocks, sizeOfInputFile);
}
// foundBlocks no longer required
// Create the stream
std::auto_ptr<IOStream> stream(new BackupStoreFileEncodeStream);
// Do the initial setup
((BackupStoreFileEncodeStream*)stream.get())->Setup(Filename, precipe, ContainerID, rStoreFilename, pModificationTime);
precipe = 0; // Stream has taken ownership of this
// Tell user about completely different status?
if(pIsCompletelyDifferent != 0)
{
*pIsCompletelyDifferent = completelyDifferent;
}
// Return the stream for the caller
return stream;
}
catch(...)
{
// cleanup
if(pindex != 0)
{
::free(pindex);
pindex = 0;
}
if(precipe != 0)
{
delete precipe;
precipe = 0;
}
throw;
}
}
// --------------------------------------------------------------------------
//
// Function
// Name: static LoadIndex(IOStream &, int64_t, BlocksAvailableEntry **, int64_t, bool &)
// Purpose: Read in an index, and decrypt, and store in the in memory block format.
// rCanDiffFromThis is set to false if the version of the from file is too old.
// Created: 12/1/04
//
// --------------------------------------------------------------------------
static void LoadIndex(IOStream &rBlockIndex, int64_t ThisID, BlocksAvailableEntry **ppIndex, int64_t &rNumBlocksOut, int Timeout, bool &rCanDiffFromThis)
{
// Reset
rNumBlocksOut = 0;
rCanDiffFromThis = false;
// Read header
file_BlockIndexHeader hdr;
if(!rBlockIndex.ReadFullBuffer(&hdr, sizeof(hdr), 0 /* not interested in bytes read if this fails */, Timeout))
{
// Couldn't read header
THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
}
#ifndef BOX_DISABLE_BACKWARDS_COMPATIBILITY_BACKUPSTOREFILE
// Check against backwards comptaibility stuff
if(hdr.mMagicValue == (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V0))
{
// Won't diff against old version
// Absorb rest of stream
char buffer[2048];
while(rBlockIndex.StreamDataLeft())
{
rBlockIndex.Read(buffer, sizeof(buffer), 1000 /* 1 sec timeout */);
}
// Tell caller
rCanDiffFromThis = false;
return;
}
#endif
// Check magic
if(hdr.mMagicValue != (int32_t)htonl(OBJECTMAGIC_FILE_BLOCKS_MAGIC_VALUE_V1))
{
THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
}
// Check that we're not trying to diff against a file which references blocks from another file
if(((int64_t)box_ntoh64(hdr.mOtherFileID)) != 0)
{
THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile)
}
// Mark as an acceptable diff.
rCanDiffFromThis = true;
// Get basic information
int64_t numBlocks = box_ntoh64(hdr.mNumBlocks);
uint64_t entryIVBase = box_ntoh64(hdr.mEntryIVBase);
//TODO: Verify that these sizes look reasonable
// Allocate space for the index
BlocksAvailableEntry *pindex = (BlocksAvailableEntry*)::malloc(sizeof(BlocksAvailableEntry) * numBlocks);
if(pindex == 0)
{
throw std::bad_alloc();
}
try
{
for(int64_t b = 0; b < numBlocks; ++b)
{
// Read an entry from the stream
file_BlockIndexEntry entry;
if(!rBlockIndex.ReadFullBuffer(&entry, sizeof(entry), 0 /* not interested in bytes read if this fails */, Timeout))
{
// Couldn't read entry
THROW_EXCEPTION(BackupStoreException, CouldntReadEntireStructureFromStream)
}
// Calculate IV for this entry
uint64_t iv = entryIVBase;
iv += b;
// Network byte order
iv = box_hton64(iv);
sBlowfishDecryptBlockEntry.SetIV(&iv);
// Decrypt the encrypted section
file_BlockIndexEntryEnc entryEnc;
int sectionSize = sBlowfishDecryptBlockEntry.TransformBlock(&entryEnc, sizeof(entryEnc),
entry.mEnEnc, sizeof(entry.mEnEnc));
if(sectionSize != sizeof(entryEnc))
{
THROW_EXCEPTION(BackupStoreException, BlockEntryEncodingDidntGiveExpectedLength)
}
// Check that we're not trying to diff against a file which references blocks from another file
if(((int64_t)box_ntoh64(entry.mEncodedSize)) <= 0)
{
THROW_EXCEPTION(BackupStoreException, CannotDiffAnIncompleteStoreFile)
}
// Store all the required information
pindex[b].mpNextInHashList = 0; // hash list not set up yet
pindex[b].mSize = ntohl(entryEnc.mSize);
pindex[b].mWeakChecksum = ntohl(entryEnc.mWeakChecksum);
::memcpy(pindex[b].mStrongChecksum, entryEnc.mStrongChecksum, sizeof(pindex[b].mStrongChecksum));
}
// Store index pointer for called
ASSERT(ppIndex != 0);
*ppIndex = pindex;
// Store number of blocks for caller
rNumBlocksOut = numBlocks;
}
catch(...)
{
// clean up and send the exception along its way
::free(pindex);
throw;
}
}
// --------------------------------------------------------------------------
//
// Function
// Name: static FindMostUsedSizes(BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
// Purpose: Finds the most commonly used block sizes in the index
// Created: 12/1/04
//
// --------------------------------------------------------------------------
static void FindMostUsedSizes(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
{
// Array for lengths
int64_t sizeCounts[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES];
// Set arrays to lots of zeros (= unused entries)
for(int l = 0; l < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++l)
{
Sizes[l] = 0;
sizeCounts[l] = 0;
}
// Array for collecting sizes
std::map<int32_t, int64_t> foundSizes;
// Run through blocks and make a count of the entries
for(int64_t b = 0; b < NumBlocks; ++b)
{
// Only if the block size is bigger than the minimum size we'll scan for
if(pIndex[b].mSize > BACKUP_FILE_DIFF_MIN_BLOCK_SIZE)
{
// Find entry?
std::map<int32_t, int64_t>::const_iterator f(foundSizes.find(pIndex[b].mSize));
if(f != foundSizes.end())
{
// Increment existing entry
foundSizes[pIndex[b].mSize] = foundSizes[pIndex[b].mSize] + 1;
}
else
{
// New entry
foundSizes[pIndex[b].mSize] = 1;
}
}
}
// Make the block sizes
for(std::map<int32_t, int64_t>::const_iterator i(foundSizes.begin()); i != foundSizes.end(); ++i)
{
// Find the position of the size in the array
for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
// Instead of sorting on the raw count of blocks,
// take the file area covered by this block size.
if(i->second * i->first > sizeCounts[t] * Sizes[t])
{
// Then this size belong before this entry -- shuffle them up
for(int s = (BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1); s >= t; --s)
{
Sizes[s] = Sizes[s-1];
sizeCounts[s] = sizeCounts[s-1];
}
// Insert this size
Sizes[t] = i->first;
sizeCounts[t] = i->second;
// Shouldn't do any more searching
break;
}
}
}
// trace the size table in debug builds
#ifndef NDEBUG
if(BackupStoreFile::TraceDetailsOfDiffProcess)
{
for(int t = 0; t < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++t)
{
TRACE3("Diff block size %d: %d (count = %lld)\n", t, Sizes[t], sizeCounts[t]);
}
}
#endif
}
// --------------------------------------------------------------------------
//
// Function
// Name: static SearchForMatchingBlocks(IOStream &, std::map<int64_t, int64_t> &, BlocksAvailableEntry *, int64_t, int32_t[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES])
// Purpose: Find the matching blocks within the file.
// Created: 12/1/04
//
// --------------------------------------------------------------------------
static void SearchForMatchingBlocks(IOStream &rFile, std::map<int64_t, int64_t> &rFoundBlocks,
BlocksAvailableEntry *pIndex, int64_t NumBlocks,
int32_t Sizes[BACKUP_FILE_DIFF_MAX_BLOCK_SIZES], DiffTimer *pDiffTimer)
{
time_t TimeMgmtEpoch = 0;
int MaximumDiffingTime = 0;
int KeepAliveTime = 0;
if (pDiffTimer)
{
TimeMgmtEpoch = pDiffTimer->GetTimeMgmtEpoch();
MaximumDiffingTime = pDiffTimer->GetMaximumDiffingTime();
KeepAliveTime = pDiffTimer->GetKeepaliveTime();
}
std::map<int64_t, int32_t> goodnessOfFit;
// Allocate the hash lookup table
BlocksAvailableEntry **phashTable = (BlocksAvailableEntry **)::malloc(sizeof(BlocksAvailableEntry *) * (64*1024));
// Choose a size for the buffer, just a little bit more than the maximum block size
int32_t bufSize = Sizes[0];
for(int z = 1; z < BACKUP_FILE_DIFF_MAX_BLOCK_SIZES; ++z)
{
if(Sizes[z] > bufSize) bufSize = Sizes[z];
}
bufSize += 4;
ASSERT(bufSize > Sizes[0]);
ASSERT(bufSize > 0);
if(bufSize > (BACKUP_FILE_MAX_BLOCK_SIZE + 1024))
{
THROW_EXCEPTION(BackupStoreException, BadBackupStoreFile)
}
// TODO: Use buffered file class
// Because we read in the file a scanned block size at a time, it is likely to be
// inefficient. Probably will be much better to use a buffering IOStream class which
// reads data in at the size of the filesystem block size.
// Allocate the buffers.
uint8_t *pbuffer0 = (uint8_t *)::malloc(bufSize);
uint8_t *pbuffer1 = (uint8_t *)::malloc(bufSize);
try
{
// Check buffer allocation
if(pbuffer0 == 0 || pbuffer1 == 0 || phashTable == 0)
{
// If a buffer got allocated, it will be cleaned up in the catch block
throw std::bad_alloc();
}
// Flag to abort the run, if too many blocks are found -- avoid using
// huge amounts of processor time when files contain many similar blocks.
bool abortSearch = false;
// Search for each block size in turn
// NOTE: Do the smallest size first, so that the scheme for adding
// entries in the found list works as expected and replaces smallers block
// with larger blocks when it finds matches at the same offset in the file.
for(int s = BACKUP_FILE_DIFF_MAX_BLOCK_SIZES - 1; s >= 0; --s)
{
ASSERT(Sizes[s] <= bufSize);
//TRACE2("Diff pass %d, for block size %d\n", s, Sizes[s]);
// Check we haven't finished
if(Sizes[s] == 0)
{
// empty entry, try next size
continue;
}
// Set up the hash table entries
SetupHashTable(pIndex, NumBlocks, Sizes[s], phashTable);
// Shift file position to beginning
rFile.Seek(0, IOStream::SeekType_Absolute);
// Read first block
if(rFile.Read(pbuffer0, Sizes[s]) != Sizes[s])
{
// Size of file too short to match -- do next size
continue;
}
// Setup block pointers
uint8_t *beginnings = pbuffer0;
uint8_t *endings = pbuffer1;
int offset = 0;
// Calculate the first checksum, ready for rolling
RollingChecksum rolling(beginnings, Sizes[s]);
// Then roll, until the file is exhausted
int64_t fileBlockNumber = 0;
int64_t fileOffset = 0;
int rollOverInitialBytes = 0;
while(true)
{
if(sDiffTimerExpired)
{
ASSERT(TimeMgmtEpoch > 0);
ASSERT(pDiffTimer != NULL);
time_t tTotalRunIntvl = time(NULL) - TimeMgmtEpoch;
if(MaximumDiffingTime > 0 &&
tTotalRunIntvl >= MaximumDiffingTime)
{
TRACE0("MaximumDiffingTime reached - "
"suspending file diff\n");
abortSearch = true;
break;
}
else if(KeepAliveTime > 0)
{
TRACE0("KeepAliveTime reached - "
"initiating keep-alive\n");
pDiffTimer->DoKeepAlive();
}
sDiffTimerExpired = false;
}
// Load in another block of data, and record how big it is
int bytesInEndings = rFile.Read(endings, Sizes[s]);
int tmp;
// Skip any bytes from a previous matched block
if(rollOverInitialBytes > 0 && offset < bytesInEndings)
{
int spaceLeft = bytesInEndings - offset;
int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
offset += thisRoll;
fileOffset += thisRoll;
rollOverInitialBytes -= thisRoll;
if(rollOverInitialBytes)
{
goto refresh;
}
}
if(goodnessOfFit.count(fileOffset))
{
tmp = goodnessOfFit[fileOffset];
}
else
{
tmp = 0;
}
if(tmp >= Sizes[s])
{
// Skip over bigger ready-matched blocks completely
rollOverInitialBytes = tmp;
int spaceLeft = bytesInEndings - offset;
int thisRoll = (rollOverInitialBytes > spaceLeft) ? spaceLeft : rollOverInitialBytes;
rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
offset += thisRoll;
fileOffset += thisRoll;
rollOverInitialBytes -= thisRoll;
if(rollOverInitialBytes)
{
goto refresh;
}
}
while(offset < bytesInEndings)
{
// Is current checksum in hash list?
uint16_t hash = rolling.GetComponentForHashing();
if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
{
if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
{
goodnessOfFit[fileOffset] = Sizes[s];
// Block matched, roll the checksum forward to the next block without doing
// any more comparisons, because these are pointless (as any more matches will be ignored when
// the receipe is generated) and just take up valuable processor time. Edge cases are
// especially nasty, using huge amounts of time and memory.
int skip = Sizes[s];
if(offset < bytesInEndings && skip > 0)
{
int spaceLeft = bytesInEndings - offset;
int thisRoll = (skip > spaceLeft) ? spaceLeft : skip;
rolling.RollForwardSeveral(beginnings+offset, endings+offset, Sizes[s], thisRoll);
offset += thisRoll;
fileOffset += thisRoll;
skip -= thisRoll;
}
// Not all the bytes necessary will have been skipped, so get them
// skipped after the next block is loaded.
rollOverInitialBytes = skip;
// End this loop, so the final byte isn't used again
break;
}
int64_t NumBlocksFound = static_cast<int64_t>(
rFoundBlocks.size());
int64_t MaxBlocksFound = NumBlocks *
BACKUP_FILE_DIFF_MAX_BLOCK_FIND_MULTIPLE;
if(NumBlocksFound > MaxBlocksFound)
{
abortSearch = true;
break;
}
}
// Roll checksum forward
rolling.RollForward(beginnings[offset], endings[offset], Sizes[s]);
// Increment offsets
++offset;
++fileOffset;
}
if(abortSearch) break;
refresh:
// Finished?
if(bytesInEndings != Sizes[s])
{
// No more data in file -- check the final block
// (Do a copy and paste of 5 lines of code instead of introducing a comparison for
// each byte of the file)
uint16_t hash = rolling.GetComponentForHashing();
if(phashTable[hash] != 0 && (goodnessOfFit.count(fileOffset) == 0 || goodnessOfFit[fileOffset] < Sizes[s]))
{
if(SecondStageMatch(phashTable[hash], rolling, beginnings, endings, offset, Sizes[s], fileBlockNumber, pIndex, rFoundBlocks))
{
goodnessOfFit[fileOffset] = Sizes[s];
}
}
// finish
break;
}
// Switch buffers, reset offset
beginnings = endings;
endings = (beginnings == pbuffer0)?(pbuffer1):(pbuffer0); // ie the other buffer
offset = 0;
// And count the blocks which have been done
++fileBlockNumber;
}
if(abortSearch) break;
}
// Free buffers and hash table
::free(pbuffer1);
pbuffer1 = 0;
::free(pbuffer0);
pbuffer0 = 0;
::free(phashTable);
phashTable = 0;
}
catch(...)
{
// Cleanup and throw
if(pbuffer1 != 0) ::free(pbuffer1);
if(pbuffer0 != 0) ::free(pbuffer0);
if(phashTable != 0) ::free(phashTable);
throw;
}
#ifndef NDEBUG
if(BackupStoreFile::TraceDetailsOfDiffProcess)
{
// Trace out the found blocks in debug mode
TRACE0("Diff: list of found blocks\n======== ======== ======== ========\n Offset BlkIdx Size Movement\n");
for(std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin()); i != rFoundBlocks.end(); ++i)
{
int64_t orgLoc = 0;
for(int64_t b = 0; b < i->second; ++b)
{
orgLoc += pIndex[b].mSize;
}
TRACE4("%8lld %8lld %8lld %8lld\n", i->first, i->second,
(int64_t)(pIndex[i->second].mSize), i->first - orgLoc);
}
TRACE0("======== ======== ======== ========\n");
}
#endif
}
// --------------------------------------------------------------------------
//
// Function
// Name: static SetupHashTable(BlocksAvailableEntry *, int64_t, in32_t, BlocksAvailableEntry **)
// Purpose: Set up the hash table ready for a scan
// Created: 14/1/04
//
// --------------------------------------------------------------------------
static void SetupHashTable(BlocksAvailableEntry *pIndex, int64_t NumBlocks, int32_t BlockSize, BlocksAvailableEntry **pHashTable)
{
// Set all entries in the hash table to zero
::memset(pHashTable, 0, (sizeof(BlocksAvailableEntry *) * (64*1024)));
// Scan through the blocks, building the hash table
for(int64_t b = 0; b < NumBlocks; ++b)
{
// Only look at the required block size
if(pIndex[b].mSize == BlockSize)
{
// Get the value under which to hash this entry
uint16_t hash = RollingChecksum::ExtractHashingComponent(pIndex[b].mWeakChecksum);
// Already present in table?
if(pHashTable[hash] != 0)
{
//TRACE1("Another hash entry for %d found\n", hash);
// Yes -- need to set the pointer in this entry to the current entry to build the linked list
pIndex[b].mpNextInHashList = pHashTable[hash];
}
// Put a pointer to this entry in the hash table
pHashTable[hash] = pIndex + b;
}
}
}
// --------------------------------------------------------------------------
//
// Function
// Name: static bool SecondStageMatch(xxx)
// Purpose: When a match in the hash table is found, scan for second stage match using strong checksum.
// Created: 14/1/04
//
// --------------------------------------------------------------------------
static bool SecondStageMatch(BlocksAvailableEntry *pFirstInHashList, RollingChecksum &fastSum, uint8_t *pBeginnings, uint8_t *pEndings,
int Offset, int32_t BlockSize, int64_t FileBlockNumber, BlocksAvailableEntry *pIndex, std::map<int64_t, int64_t> &rFoundBlocks)
{
// Check parameters
ASSERT(pBeginnings != 0);
ASSERT(pEndings != 0);
ASSERT(Offset >= 0);
ASSERT(BlockSize > 0);
ASSERT(pFirstInHashList != 0);
ASSERT(pIndex != 0);
#ifndef NDEBUG
uint16_t DEBUG_Hash = fastSum.GetComponentForHashing();
#endif
uint32_t Checksum = fastSum.GetChecksum();
// Before we go to the expense of the MD5, make sure it's a darn good match on the checksum we already know.
BlocksAvailableEntry *scan = pFirstInHashList;
bool found=false;
while(scan != 0)
{
if(scan->mWeakChecksum == Checksum)
{
found = true;
break;
}
scan = scan->mpNextInHashList;
}
if(!found)
{
return false;
}
// Calculate the strong MD5 digest for this block
MD5Digest strong;
// Add the data from the beginnings
strong.Add(pBeginnings + Offset, BlockSize - Offset);
// Add any data from the endings
if(Offset > 0)
{
strong.Add(pEndings, Offset);
}
strong.Finish();
// Then go through the entries in the hash list, comparing with the strong digest calculated
scan = pFirstInHashList;
//TRACE0("second stage match\n");
while(scan != 0)
{
//TRACE3("scan size %d, block size %d, hash %d\n", scan->mSize, BlockSize, Hash);
ASSERT(scan->mSize == BlockSize);
ASSERT(RollingChecksum::ExtractHashingComponent(scan->mWeakChecksum) == DEBUG_Hash);
// Compare?
if(strong.DigestMatches(scan->mStrongChecksum))
{
//TRACE0("Match!\n");
// Found! Add to list of found blocks...
int64_t fileOffset = (FileBlockNumber * BlockSize) + Offset;
int64_t blockIndex = (scan - pIndex); // pointer arthmitic is frowned upon. But most efficient way of doing it here -- alternative is to use more memory
// We do NOT search for smallest blocks first, as this code originally assumed.
// To prevent this from potentially overwriting a better match, the caller must determine
// the relative "goodness" of any existing match and this one, and avoid the call if it
// could be detrimental.
rFoundBlocks[fileOffset] = blockIndex;
// No point in searching further, report success
return true;
}
// Next
scan = scan->mpNextInHashList;
}
// Not matched
return false;
}
// --------------------------------------------------------------------------
//
// Function
// Name: static GenerateRecipe(BackupStoreFileEncodeStream::Recipe &, BlocksAvailableEntry *, int64_t, std::map<int64_t, int64_t> &)
// Purpose: Fills in the recipe from the found block list
// Created: 15/1/04
//
// --------------------------------------------------------------------------
static void GenerateRecipe(BackupStoreFileEncodeStream::Recipe &rRecipe, BlocksAvailableEntry *pIndex,
int64_t NumBlocks, std::map<int64_t, int64_t> &rFoundBlocks, int64_t SizeOfInputFile)
{
// NOTE: This function could be a lot more sophisiticated. For example, if
// a small block overlaps a big block like this
// ****
// *******************************
// then the small block will be used, not the big one. But it'd be better to
// just ignore the small block and keep the big one. However, some stats should
// be gathered about real world files before writing complex code which might
// go wrong.
// Initialise a blank instruction
BackupStoreFileEncodeStream::RecipeInstruction instruction;
#define RESET_INSTRUCTION \
instruction.mSpaceBefore = 0; \
instruction.mBlocks = 0; \
instruction.mpStartBlock = 0;
RESET_INSTRUCTION
// First, a special case for when there are no found blocks
if(rFoundBlocks.size() == 0)
{
// No blocks, just a load of space
instruction.mSpaceBefore = SizeOfInputFile;
rRecipe.push_back(instruction);
#ifndef NDEBUG
if(BackupStoreFile::TraceDetailsOfDiffProcess)
{
TRACE1("Diff: Default recipe generated, %lld bytes of file\n", SizeOfInputFile);
}
#endif
// Don't do anything
return;
}
// Current location
int64_t loc = 0;
// Then iterate through the list, generating the recipe
std::map<int64_t, int64_t>::const_iterator i(rFoundBlocks.begin());
ASSERT(i != rFoundBlocks.end()); // check logic
// Counting for debug tracing
#ifndef NDEBUG
int64_t debug_NewBytesFound = 0;
int64_t debug_OldBlocksUsed = 0;
#endif
for(; i != rFoundBlocks.end(); ++i)
{
// Remember... map is (position in file) -> (index of block in pIndex)
if(i->first < loc)
{
// This block overlaps the last one
continue;
}
else if(i->first > loc)
{
// There's a gap between the end of the last thing and this block.
// If there's an instruction waiting, push it onto the list
if(instruction.mSpaceBefore != 0 || instruction.mpStartBlock != 0)
{
rRecipe.push_back(instruction);
}
// Start a new instruction, with the gap ready
RESET_INSTRUCTION
instruction.mSpaceBefore = i->first - loc;
// Move location forward to match
loc += instruction.mSpaceBefore;
#ifndef NDEBUG
debug_NewBytesFound += instruction.mSpaceBefore;
#endif
}
// First, does the current instruction need pushing back, because this block is not
// sequential to the last one?
if(instruction.mpStartBlock != 0 && (pIndex + i->second) != (instruction.mpStartBlock + instruction.mBlocks))
{
rRecipe.push_back(instruction);
RESET_INSTRUCTION
}
// Add in this block
if(instruction.mpStartBlock == 0)
{
// This block starts a new instruction
instruction.mpStartBlock = pIndex + i->second;
instruction.mBlocks = 1;
}
else
{
// It continues the previous section of blocks
instruction.mBlocks += 1;
}
#ifndef NDEBUG
debug_OldBlocksUsed++;
#endif
// Move location forward
loc += pIndex[i->second].mSize;
}
// Push the last instruction generated
rRecipe.push_back(instruction);
// Is there any space left at the end which needs sending?
if(loc != SizeOfInputFile)
{
RESET_INSTRUCTION
instruction.mSpaceBefore = SizeOfInputFile - loc;
#ifndef NDEBUG
debug_NewBytesFound += instruction.mSpaceBefore;
#endif
rRecipe.push_back(instruction);
}
// dump out the recipe
#ifndef NDEBUG
TRACE2("Diff: %lld new bytes found, %lld old blocks used\n", debug_NewBytesFound, debug_OldBlocksUsed);
if(BackupStoreFile::TraceDetailsOfDiffProcess)
{
TRACE1("Diff: Recipe generated (size %d)\n======== ========= ========\nSpace b4 FirstBlk NumBlks\n", rRecipe.size());
{
for(unsigned int e = 0; e < rRecipe.size(); ++e)
{
char b[64];
#ifdef WIN32
sprintf(b, "%8I64d", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
#else
sprintf(b, "%8lld", (int64_t)(rRecipe[e].mpStartBlock - pIndex));
#endif
TRACE3("%8lld %s %8lld\n", rRecipe[e].mSpaceBefore, (rRecipe[e].mpStartBlock == 0)?" -":b, (int64_t)rRecipe[e].mBlocks);
}
}
TRACE0("======== ========= ========\n");
}
#endif
}
syntax highlighted by Code2HTML, v. 0.9.1