765 lines
22 KiB
ObjectPascal

(* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is TurboPower Abbrevia
*
* The Initial Developer of the Original Code is
* TurboPower Software
*
* Portions created by the Initial Developer are Copyright (C) 1997-2002
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* ***** END LICENSE BLOCK ***** *)
{*********************************************************}
{* ABBREVIA: AbDfInW.pas *}
{*********************************************************}
{* Deflate input sliding window unit *}
{*********************************************************}
unit AbDfInW;
{$I AbDefine.inc}
interface
uses
Classes,
AbDfBase;
{Notes: TdfInputWindow implements a sliding window on data for the
LZ77 dictionary encoding.
The stream passed to the class is automatically read when
required to keep the internal buffer fully loaded.
}
type
TAbDfMatch = record
maLen : integer;
maDist : integer;
maLit : AnsiChar;
end;
type
PAbPointerList = ^TAbPointerList;
TAbPointerList = array[0..MaxInt div SizeOf(Pointer) - 1] of Pointer;
TAbDfInputWindow = class
private
FAdvanceStart : boolean;
FBuffer : PAnsiChar;
FBufferEnd : PAnsiChar;
FBytesUsed : longint;
FChainLen : integer;
FHashChains : PAbPointerList;
FHashHeads : PAbPointerList;
FHashIndex : integer;
FChecksum : longint;
FCurrent : PAnsiChar;
FLookAheadEnd : PAnsiChar;
FMaxMatchLen : integer;
FMustSlide : boolean;
FOnProgress : TAbProgressStep;
FSlidePoint : PAnsiChar;
FStart : PAnsiChar;
FStartOffset : longint;
FStream : TStream;
FStreamSize : Int64;
FUseCRC32 : boolean;
FUseDeflate64 : boolean;
FWinMask : integer;
FWinSize : integer;
protected
function iwGetChecksum : longint;
procedure iwReadFromStream;
procedure iwSetCapacity(aValue : longint);
procedure iwSlide;
public
constructor Create(aStream : TStream;
aStreamSize : Int64;
aWinSize : integer;
aChainLength : integer;
aUseDeflate64 : boolean;
aUseCRC32 : boolean);
destructor Destroy; override;
procedure Advance(aCount : integer;
aHashCount : integer);
procedure AdvanceByOne;
function FindLongestMatch(aAmpleLength : integer;
var aMatch : TAbDfMatch;
const aPrevMatch : TAbDfMatch) : boolean;
function GetNextChar : AnsiChar;
function GetNextKeyLength : integer;
function Position : longint;
procedure ReadBuffer(var aBuffer; aCount : longint;
aOffset : Int64);
property ChainLen : integer read FChainLen write FChainLen;
property Checksum : longint read iwGetChecksum;
property OnProgress : TAbProgressStep
read FOnProgress write FOnProgress;
end;
implementation
uses
SysUtils;
{Notes:
Meaning of the internal pointers:
|----------+===================+==+--------------------------|
| | | | |
FBuffer FStart FCurrent FLookAheadEnd FBufferEnd
FCurrent is the current match position. The valid data that
can be matched is between FStart and FLookAheadEnd, The data
between FStart and FCurrent has already been seen; the data
between FCurrent and FLookAheadEnd can be used for matching.
The buffer size depends on the requested window size (a
multiple of 1KB, up to 32KB for deflate, up to 64KB for
deflate64) and the lookahead size (up to 258 bytes for deflate
and 64KB for deflate64.)
The window of data continuously slides to the right, and is
slid back to FBuffer whenever FStart reaches a point 16KB
away, this point being given by FSlidePoint.
The hash table:
This is a chained hash table with some peculiarities. First
the table itself, FHashHeads. It contains pointers to strings
in the window buffer, not to chains. The chains are held is a
separate structure, FHashChains. The hash function on the
three-character keys is a Rabin-Karp function:
((((Ch1 shl 5) xor Ch2) shl 5) xor Ch3) and $3FFF
designed so that a running hash value can be kept and
calculated per character. The hash table is $4000 elements
long (obviously, given the hash function).
On insertion, the previous pointer in the hash table at the
calculated index is saved and replaced by the new pointer. The
old pointer is saved in the chains array. This has the same
number of elements as the sliding window has characters. The
pointer is placed at (Ptr and (WindowsSize-1)) overwriting the
value that's already there. In this fashion the individual
chains in the standard hash table are interwoven with each
other in this hash table, like a skein of threads.
}
const
c_HashCount = $4000; {the number of hash entries}
c_HashMask = c_HashCount - 1; {a mask for the hash function}
c_HashShift = 5; {shift value for the hash function}
{===TAbDfInputWindow=================================================}
constructor TAbDfInputWindow.Create(aStream : TStream;
aStreamSize : Int64;
aWinSize : integer;
aChainLength : integer;
aUseDeflate64 : boolean;
aUseCRC32 : boolean);
begin
{create the ancestor}
inherited Create;
{save parameters}
FStreamSize := aStreamSize;
FWinSize := aWinSize;
FWinMask := aWinSize - 1;
FStream := aStream;
FChainLen := aChainLength;
FUseDeflate64 := aUseDeflate64;
FUseCRC32 := aUseCRC32;
if aUseCRC32 then
FChecksum := -1 { CRC32 starts off with all bits set }
else
FCheckSum := 1; { Adler32 starts off with a value of 1 }
{set capacity of sliding window}
iwSetCapacity(aWinSize);
{create the hash table, first the hash table itself (and set all
entries to nil)}
FHashHeads := AllocMem(c_HashCount * sizeof(pointer));
{..now the chains (there's no need to set the entries to nil, since
the chain entries get fed from the head entries before searching)}
GetMem(FHashChains, aWinSize * sizeof(pointer));
{read the first chunk of data from the stream}
FMustSlide := true;
iwReadFromStream;
{if there are at least two bytes, prime the hash index}
if ((FLookAheadEnd - FBuffer) >= 2) then
FHashIndex := ((longint(FBuffer[0]) shl c_HashShift) xor
longint(FBuffer[1])) and
c_HashMask;
end;
{--------}
destructor TAbDfInputWindow.Destroy;
begin
{free the hash table}
FreeMem(FHashHeads);
FreeMem(FHashChains);
{free the buffer}
FreeMem(FBuffer);
{destroy the ancestor}
inherited Destroy;
end;
{--------}
procedure TAbDfInputWindow.Advance(aCount : integer;
aHashCount : integer);
var
i : integer;
ByteCount : integer;
Percent : integer;
HashChains: PAbPointerList;
HashHeads : PAbPointerList;
HashInx : integer;
CurPos : PAnsiChar;
begin
Assert((FLookAheadEnd - FCurrent) >= aCount,
'TAbDfInputWindow.Advance: seem to be advancing into the unknown');
Assert((aHashCount = aCount) or (aHashCount = pred(aCount)),
'TAbDfInputWindow.Advance: the parameters are plain wrong');
{use local var for speed}
CurPos := FCurrent;
{advance the current pointer if needed}
if (aCount > aHashCount) then
inc(CurPos);
{make sure we update the hash table; remember that the string[3] at
the current position has already been added to the hash table (for
notes on updating the hash table, see FindLongestMatch}
{use local vars for speed}
HashChains := FHashChains;
HashHeads := FHashHeads;
HashInx := FHashIndex;
{update the hash table}
for i := 0 to pred(aHashCount) do begin
HashInx :=
((HashInx shl c_HashShift) xor longint(CurPos[2])) and
c_HashMask;
HashChains^[longint(CurPos) and FWinMask] :=
HashHeads^[HashInx];
HashHeads^[HashInx] := CurPos;
inc(CurPos);
end;
{replace old values}
FHashChains := HashChains;
FHashHeads := HashHeads;
FHashIndex := HashInx;
FCurrent := CurPos;
{if we've seen at least FWinSize bytes...}
if FAdvanceStart then begin
{advance the start of the sliding window}
inc(FStart, aCount);
inc(FStartOffset, aCount);
{check to see if we have advanced into the slide zone}
if FMustSlide and (FStart >= FSlidePoint) then
iwSlide;
end
{otherwise check to see if we've seen at least FWinSize bytes}
else if ((CurPos - FStart) >= FWinSize) then begin
FAdvanceStart := true;
{note: we can't advance automatically aCount bytes here, we need
to calculate the actual count}
ByteCount := (CurPos - FWinSize) - FStart;
inc(FStart, ByteCount);
inc(FStartOffset, ByteCount);
end;
{show progress}
if Assigned(FOnProgress) then begin
inc(FBytesUsed, aCount);
if ((FBytesUsed and $FFF) = 0) then begin
Percent := Round((100.0 * FBytesUsed) / FStreamSize);
FOnProgress(Percent);
end;
end;
{check to see if we have advanced into the slide zone}
if (FStart >= FSlidePoint) then
iwSlide;
end;
{--------}
procedure TAbDfInputWindow.AdvanceByOne;
var
Percent : integer;
begin
{advance the current pointer}
inc(FCurrent);
{if we've seen at least FWinSize bytes...}
if FAdvanceStart then begin
{advance the start of the sliding window}
inc(FStart, 1);
inc(FStartOffset, 1);
{check to see if we have advanced into the slide zone}
if FMustSlide and (FStart >= FSlidePoint) then
iwSlide;
end
{otherwise check to see if we've seen FWinSize bytes}
else if ((FCurrent - FStart) = FWinSize) then
FAdvanceStart := true;
{show progress}
if Assigned(FOnProgress) then begin
inc(FBytesUsed, 1);
if ((FBytesUsed and $FFF) = 0) then begin
Percent := Round((100.0 * FBytesUsed) / FStreamSize);
FOnProgress(Percent);
end;
end;
end;
{--------}
function TAbDfInputWindow.FindLongestMatch(aAmpleLength : integer;
var aMatch : TAbDfMatch;
const aPrevMatch : TAbDfMatch)
: boolean;
{Note: this routine implements a greedy algorithm and is by far the
time sink for compression. There are two versions, one written
in Pascal for understanding, one in assembler for speed.
Activate one and only one of the following compiler defines.}
{$IFDEF CPU386}
{$DEFINE UseGreedyAsm}
{$ELSE}
{$DEFINE UseGreedyPascal}
{$ENDIF}
{Check to see that all is correct}
{$IFDEF UseGreedyAsm}
{$IFDEF UseGreedyPascal}
!! Compile Error: only one of the greedy compiler defines can be used
{$ENDIF}
{$ELSE}
{$IFNDEF UseGreedyPascal}
!! Compile Error: one of the greedy compiler defines must be used
{$ENDIF}
{$ENDIF}
type
PLongint = ^longint;
PWord = ^word;
var
MaxLen : longint;
MaxDist : longint;
MaxMatch : integer;
ChainLen : integer;
PrevStrPos : PAnsiChar;
CurPos : PAnsiChar;
{$IFDEF UseGreedyAsm}
CurWord : word;
MaxWord : word;
{$ENDIF}
{$IFDEF UseGreedyPascal}
Len : longint;
MatchStr : PAnsiChar;
CurrentCh : PAnsiChar;
CurCh : AnsiChar;
MaxCh : AnsiChar;
{$ENDIF}
begin
{calculate the hash index for the current position; using the
Rabin-Karp algorithm this is equal to the previous index less the
effect of the character just lost plus the effect of the character
just gained}
CurPos := FCurrent;
FHashIndex :=
((FHashIndex shl c_HashShift) xor longint(CurPos[2])) and
c_HashMask;
{get the head of the hash chain: this is the position in the sliding
window of the previous 3-character string with this hash value}
PrevStrPos := FHashHeads^[FHashIndex];
{set the head of the hash chain equal to our current position}
FHashHeads^[FHashIndex] := CurPos;
{update the chain itself: set the entry for this position equal to
the previous string position}
FHashChains^[longint(CurPos) and FWinMask] := PrevStrPos;
{calculate the maximum match we could do at this position}
MaxMatch := (FLookAheadEnd - CurPos);
if (MaxMatch > FMaxMatchLen) then
MaxMatch := FMaxMatchLen;
if (aAmpleLength > MaxMatch) then
aAmpleLength := MaxMatch;
{calculate the current match length}
if (aPrevMatch.maLen = 0) then
MaxLen := 2
else begin
if (MaxMatch < aPrevMatch.maLen) then begin
Result := false;
aMatch.maLen := 0;
aMatch.maLit := CurPos^;
Exit;
end;
MaxLen := aPrevMatch.maLen;
end;
{get the bytes at the current position and at the end of the maximum
match we have to better}
{$IFDEF UseGreedyAsm}
CurWord := PWord(CurPos)^;
MaxWord := PWord(CurPos + pred(MaxLen))^;
{$ENDIF}
{$IFDEF UseGreedyPascal}
CurCh := CurPos^;
MaxCh := (CurPos + pred(MaxLen))^;
{$ENDIF}
{set the chain length to search based on the current maximum match
(basically: if we've already satisfied the ample length
requirement, don't search as far)}
if (MaxLen >= aAmpleLength) then
ChainLen := FChainLen div 4
else
ChainLen := FChainLen;
{get ready for the loop}
{$IFDEF DefeatWarnings}
MaxDist := 0;
{$ENDIF}
{$IFDEF UseGreedyAsm} { slip into assembler for speed...}
asm
push ebx { save those registers we should}
push esi
push edi
mov ebx, Self { ebx will store the Self pointer}
mov edi, PrevStrPos { edi => previous string}
mov esi, CurPos { esi => current string}
@@TestThisPosition:
{ check previous string is in range}
or edi, edi
je @@Exit
cmp edi, [ebx].TAbDfInputWindow.FStart
jb @@Exit
cmp edi, CurPos
jae @@Exit
mov ax, [edi] { check previous string starts with same}
cmp CurWord, ax { two bytes as current}
jne @@GetNextPosition { ..nope, they don't match}
mov edx, edi { check previous string ends with same}
add edi, MaxLen { two bytes as current (by "ends" we}
dec edi { mean the last two bytes at the}
mov ax, [edi] { current match length)}
cmp MaxWord, ax
mov edi, edx
jne @@GetNextPosition { ..nope, they don't match}
push edi { compare the previous string with the}
push esi { current string}
mov eax, MaxMatch
add edi, 2 { (we've already checked that the first}
sub eax, 2 { two characters are the same)}
add esi, 2
mov ecx, eax
@@CmpQuads:
cmp ecx, 4
jb @@CmpSingles
mov edx, [esi]
cmp edx, [edi]
jne @@CmpSingles
add esi, 4
add edi, 4
sub ecx, 4
jnz @@CmpQuads
jmp @@MatchCheck
@@CmpSingles:
or ecx, ecx
jb @@MatchCheck
mov dl, [esi]
cmp dl, [edi]
jne @@MatchCheck
inc esi
inc edi
dec ecx
jnz @@CmpSingles
@@MatchCheck:
sub eax, ecx
add eax, 2
pop esi
pop edi
cmp eax, MaxLen { have we found a longer match?}
jbe @@GetNextPosition { ..no}
mov MaxLen, eax { ..yes, so save it}
mov eax, esi { calculate the dist for this new match}
sub eax, edi
mov MaxDist, eax
cmp eax, aAmpleLength { if this match is ample enough, exit}
jae @@Exit
mov eax, esi { calculate the two bytes at the end of}
add eax, MaxLen { this new match}
dec eax
mov ax, [eax]
mov MaxWord, ax
@@GetNextPosition:
mov eax, ChainLen { we've visited one more link on the}
dec eax { chain, if that's the last one we}
je @@Exit { should visit, exit}
mov ChainLen, eax
{ advance along the chain}
mov edx, [ebx].TAbDfInputWindow.FHashChains
mov eax, [ebx].TAbDfInputWindow.FWinMask
and edi, eax
shl edi, 2
mov edi, [edx+edi]
jmp @@TestThisPosition
@@Exit:
pop edi
pop esi
pop ebx
end;
{$ENDIF}
{$IFDEF UseGreedyPascal}
{for all possible hash nodes in the chain...}
while (FStart <= PrevStrPos) and (PrevStrPos < CurPos) do begin
{if the initial and maximal characters match...}
if (PrevStrPos[0] = CurCh) and
(PrevStrPos[pred(MaxLen)] = MaxCh) then begin
{compare more characters}
Len := 1;
CurrentCh := CurPos + 1;
MatchStr := PrevStrPos + 1;
{compare away, but don't go above the maximum length}
while (Len < MaxMatch) and (MatchStr^ = CurrentCh^) do begin
inc(CurrentCh);
inc(MatchStr);
inc(Len);
end;
{have we reached another maximum for the length?}
if (Len > MaxLen) then begin
MaxLen := Len;
{calculate the distance}
MaxDist := CurPos - PrevStrPos;
MaxCh := CurPos[pred(MaxLen)];
{is the new best length ample enough?}
if MaxLen >= aAmpleLength then
Break;
end;
end;
{have we reached the end of this chain?}
dec(ChainLen);
if (ChainLen = 0) then
Break;
{otherwise move onto the next position}
PrevStrPos := FHashChains^[longint(PrevStrPos) and FWinMask];
end;
{$ENDIF}
{based on the results of our investigation, return the match values}
if (MaxLen < 3) or (MaxLen <= aPrevMatch.maLen) then begin
Result := false;
aMatch.maLen := 0;
aMatch.maLit := CurPos^;
end
else begin
Result := true;
aMatch.maLen := MaxLen;
aMatch.maDist := MaxDist;
aMatch.maLit := CurPos^; { just in case...}
end;
end;
{--------}
function TAbDfInputWindow.GetNextChar : AnsiChar;
begin
Result := FCurrent^;
inc(FCurrent);
end;
{--------}
function TAbDfInputWindow.GetNextKeyLength : integer;
begin
Result := FLookAheadEnd - FCurrent;
if (Result > 3) then
Result := 3;
end;
{--------}
function TAbDfInputWindow.iwGetChecksum : longint;
begin
{the CRC32 checksum algorithm requires a post-conditioning step
after being calculated (the result is NOTted), whereas Adler32 does
not}
if FUseCRC32 then
Result := not FChecksum
else
Result := FChecksum;
end;
{--------}
procedure TAbDfInputWindow.iwReadFromStream;
var
BytesRead : longint;
BytesToRead : longint;
begin
{read some more data into the look ahead zone}
BytesToRead := FBufferEnd - FLookAheadEnd;
BytesRead := FStream.Read(FLookAheadEnd^, BytesToRead);
{if nothing was read, we reached the end of the stream; hence
there's no more need to slide the window since we have all the
data}
if (BytesRead = 0) then
FMustSlide := false
{otherwise something was actually read...}
else begin
{update the checksum}
if FUseCRC32 then
AbUpdateCRCBuffer(FChecksum, FLookAheadEnd^, BytesRead)
else
AbUpdateAdlerBuffer(FChecksum, FLookAheadEnd^, BytesRead);
{reposition the pointer for the end of the lookahead area}
inc(FLookAheadEnd, BytesRead);
end;
end;
{--------}
procedure TAbDfInputWindow.iwSetCapacity(aValue : longint);
var
ActualSize : integer;
begin
{calculate the actual size; this will be the value passed in, plus
the correct look ahead size, plus 16KB}
ActualSize := aValue + (16 * 1024);
if FUseDeflate64 then begin
inc(ActualSize, dfc_MaxMatchLen64);
FMaxMatchLen := dfc_MaxMatchLen64;
end
else begin
inc(ActualSize, dfc_MaxMatchLen);
FMaxMatchLen := dfc_MaxMatchLen;
end;
{get the new buffer}
GetMem(FBuffer, ActualSize);
{set the other buffer pointers}
FStart := FBuffer;
FCurrent := FBuffer;
FLookAheadEnd := FBuffer;
FBufferEnd := FBuffer + ActualSize;
FSlidePoint := FBuffer + (16 * 1024);
end;
{--------}
procedure TAbDfInputWindow.iwSlide;
type
PLongint = ^longint;
var
i : integer;
ByteCount : integer;
Buffer : longint;
ListItem : PLongint;
begin
{move current valid data back to the start of the buffer}
ByteCount := FLookAheadEnd - FStart;
Move(FStart^, FBuffer^, ByteCount);
{reset the various pointers}
ByteCount := FStart - FBuffer;
FStart := FBuffer;
dec(FCurrent, ByteCount);
dec(FLookAheadEnd, ByteCount);
{patch up the hash table: the head pointers}
Buffer := longint(FBuffer);
ListItem := PLongint(@FHashHeads^[0]);
for i := 0 to pred(c_HashCount) do begin
dec(ListItem^, ByteCount);
if (ListItem^ < Buffer) then
ListItem^ := 0;
inc(PAnsiChar(ListItem), sizeof(pointer));
end;
{..the chain pointers}
ListItem := PLongint(@FHashChains^[0]);
for i := 0 to pred(FWinSize) do begin
dec(ListItem^, ByteCount);
if (ListItem^ < Buffer) then
ListItem^ := 0;
inc(PAnsiChar(ListItem), sizeof(pointer));
end;
{now read some more data from the stream}
iwReadFromStream;
end;
{--------}
function TAbDfInputWindow.Position : longint;
begin
Result := (FCurrent - FStart) + FStartOffset;
end;
{--------}
procedure TAbDfInputWindow.ReadBuffer(var aBuffer; aCount : longint;
aOffset : Int64);
var
CurPos : Int64;
begin
CurPos := FStream.Seek(0, soCurrent);
FStream.Seek(aOffSet, soBeginning);
FStream.ReadBuffer(aBuffer, aCount);
FStream.Seek(CurPos, soBeginning);
end;
{====================================================================}
end.