lasarus_compotents/Abbrevia/source/AbDfInW.pas

(* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is TurboPower Abbrevia
 *
 * The Initial Developer of the Original Code is
 * TurboPower Software
 *
 * Portions created by the Initial Developer are Copyright (C) 1997-2002
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *
 * ***** END LICENSE BLOCK ***** *)

{*********************************************************}
{* ABBREVIA: AbDfInW.pas                                 *}
{*********************************************************}
{* Deflate input sliding window unit                     *}
{*********************************************************}

unit AbDfInW;

{$I AbDefine.inc}

interface

uses
  Classes,
  AbDfBase;

{Notes: TdfInputWindow implements a sliding window on data for the
        LZ77 dictionary encoding.

        The stream passed to the class is automatically read when
        required to keep the internal buffer fully loaded.
        }

type
  TAbDfMatch = record
    maLen  : integer;
    maDist : integer;
    maLit  : AnsiChar;
  end;

type
  PAbPointerList = ^TAbPointerList;
  TAbPointerList = array[0..MaxInt div SizeOf(Pointer) - 1] of Pointer;

  TAbDfInputWindow = class
    private
      FAdvanceStart : boolean;
      FBuffer       : PAnsiChar;
      FBufferEnd    : PAnsiChar;
      FBytesUsed    : longint;
      FChainLen     : integer;
      FHashChains   : PAbPointerList;
      FHashHeads    : PAbPointerList;
      FHashIndex    : integer;
      FChecksum     : longint;
      FCurrent      : PAnsiChar;
      FLookAheadEnd : PAnsiChar;
      FMaxMatchLen  : integer;
      FMustSlide    : boolean;
      FOnProgress   : TAbProgressStep;
      FSlidePoint   : PAnsiChar;
      FStart        : PAnsiChar;
      FStartOffset  : longint;
      FStream       : TStream;
      FStreamSize   : Int64;
      FUseCRC32     : boolean;
      FUseDeflate64 : boolean;
      FWinMask      : integer;
      FWinSize      : integer;
    protected
      function iwGetChecksum : longint;
      procedure iwReadFromStream;
      procedure iwSetCapacity(aValue : longint);
      procedure iwSlide;
    public
      constructor Create(aStream       : TStream;
                         aStreamSize   : Int64;
                         aWinSize      : integer;
                         aChainLength  : integer;
                         aUseDeflate64 : boolean;
                         aUseCRC32     : boolean);
      destructor Destroy; override;

      procedure Advance(aCount     : integer;
                        aHashCount : integer);
      procedure AdvanceByOne;
      function FindLongestMatch(aAmpleLength : integer;
                            var aMatch       : TAbDfMatch;
                          const aPrevMatch   : TAbDfMatch) : boolean;
      function GetNextChar : AnsiChar;
      function GetNextKeyLength : integer;
      function Position : longint;
      procedure ReadBuffer(var aBuffer; aCount  : longint;
                                        aOffset : Int64);

      property ChainLen : integer read FChainLen write FChainLen;
      property Checksum : longint read iwGetChecksum;
      property OnProgress : TAbProgressStep
                  read FOnProgress write FOnProgress;
  end;

implementation

uses
  SysUtils;

{Notes:
        Meaning of the internal pointers:

        |----------+===================+==+--------------------------|
        |          |                   |  |                          |
        FBuffer    FStart       FCurrent  FLookAheadEnd     FBufferEnd

        FCurrent is the current match position. The valid data that
        can be matched is between FStart and FLookAheadEnd, The data
        between FStart and FCurrent has already been seen; the data
        between FCurrent and FLookAheadEnd can be used for matching.

        The buffer size depends on the requested window size (a
        multiple of 1KB, up to 32KB for deflate, up to 64KB for
        deflate64) and the lookahead size (up to 258 bytes for deflate
        and 64KB for deflate64.)

        The window of data continuously slides to the right, and is
        slid back to FBuffer whenever FStart reaches a point 16KB
        away, this point being given by FSlidePoint.


        The hash table:
        This is a chained hash table with some peculiarities. First
        the table itself, FHashHeads. It contains pointers to strings
        in the window buffer, not to chains. The chains are held is a
        separate structure, FHashChains. The hash function on the
        three-character keys is a Rabin-Karp function:
          ((((Ch1 shl 5) xor Ch2) shl 5) xor Ch3) and $3FFF
        designed so that a running hash value can be kept and
        calculated per character. The hash table is $4000 elements
        long (obviously, given the hash function).
        On insertion, the previous pointer in the hash table at the
        calculated index is saved and replaced by the new pointer. The
        old pointer is saved in the chains array. This has the same
        number of elements as the sliding window has characters. The
        pointer is placed at (Ptr and (WindowsSize-1)) overwriting the
        value that's already there. In this fashion the individual
        chains in the standard hash table are interwoven with each
        other in this hash table, like a skein of threads.
        }

const
  c_HashCount = $4000;             {the number of hash entries}
  c_HashMask  = c_HashCount - 1;   {a mask for the hash function}
  c_HashShift = 5;                 {shift value for the hash function}

{===TAbDfInputWindow=================================================}
constructor TAbDfInputWindow.Create(aStream       : TStream;
                                    aStreamSize   : Int64;
                                    aWinSize      : integer;
                                    aChainLength  : integer;
                                    aUseDeflate64 : boolean;
                                    aUseCRC32     : boolean);
begin
  {create the ancestor}
  inherited Create;

  {save parameters}
  FStreamSize := aStreamSize;
  FWinSize := aWinSize;
  FWinMask := aWinSize - 1;
  FStream := aStream;
  FChainLen := aChainLength;
  FUseDeflate64 := aUseDeflate64;
  FUseCRC32 := aUseCRC32;
  if aUseCRC32 then
    FChecksum := -1  { CRC32 starts off with all bits set }
  else
    FCheckSum := 1;  { Adler32 starts off with a value of 1 }

  {set capacity of sliding window}
  iwSetCapacity(aWinSize);

  {create the hash table, first the hash table itself (and set all
   entries to nil)}
  FHashHeads := AllocMem(c_HashCount * sizeof(pointer));

  {..now the chains (there's no need to set the entries to nil, since
   the chain entries get fed from the head entries before searching)}
  GetMem(FHashChains, aWinSize * sizeof(pointer));

  {read the first chunk of data from the stream}
  FMustSlide := true;
  iwReadFromStream;

  {if there are at least two bytes, prime the hash index}
  if ((FLookAheadEnd - FBuffer) >= 2) then
    FHashIndex := ((longint(FBuffer[0]) shl c_HashShift) xor
                   longint(FBuffer[1])) and
                  c_HashMask;
end;
{--------}
destructor TAbDfInputWindow.Destroy;
begin
  {free the hash table}
  FreeMem(FHashHeads);
  FreeMem(FHashChains);

  {free the buffer}
  FreeMem(FBuffer);

  {destroy the ancestor}
  inherited Destroy;
end;
{--------}
procedure TAbDfInputWindow.Advance(aCount     : integer;
                                   aHashCount : integer);
var
  i : integer;
  ByteCount : integer;
  Percent   : integer;
  HashChains: PAbPointerList;
  HashHeads : PAbPointerList;
  HashInx   : integer;
  CurPos    : PAnsiChar;
begin
  Assert((FLookAheadEnd - FCurrent) >= aCount,
         'TAbDfInputWindow.Advance: seem to be advancing into the unknown');
  Assert((aHashCount = aCount) or (aHashCount = pred(aCount)),
         'TAbDfInputWindow.Advance: the parameters are plain wrong');
  {use local var for speed}
  CurPos := FCurrent;

  {advance the current pointer if needed}
  if (aCount > aHashCount) then
    inc(CurPos);

  {make sure we update the hash table; remember that the string[3] at
   the current position has already been added to the hash table (for
   notes on updating the hash table, see FindLongestMatch}

  {use local vars for speed}
  HashChains := FHashChains;
  HashHeads := FHashHeads;
  HashInx := FHashIndex;

  {update the hash table}
  for i := 0 to pred(aHashCount) do begin
    HashInx :=
       ((HashInx shl c_HashShift) xor longint(CurPos[2])) and
       c_HashMask;
    HashChains^[longint(CurPos) and FWinMask] :=
       HashHeads^[HashInx];
    HashHeads^[HashInx] := CurPos;
    inc(CurPos);
  end;

  {replace old values}
  FHashChains := HashChains;
  FHashHeads := HashHeads;
  FHashIndex := HashInx;
  FCurrent := CurPos;

  {if we've seen at least FWinSize bytes...}
  if FAdvanceStart then begin

    {advance the start of the sliding window}
    inc(FStart, aCount);
    inc(FStartOffset, aCount);

    {check to see if we have advanced into the slide zone}
    if FMustSlide and (FStart >= FSlidePoint) then
      iwSlide;
  end

  {otherwise check to see if we've seen at least FWinSize bytes}
  else if ((CurPos - FStart) >= FWinSize) then begin
    FAdvanceStart := true;
    {note: we can't advance automatically aCount bytes here, we need
           to calculate the actual count}
    ByteCount := (CurPos - FWinSize) - FStart;
    inc(FStart, ByteCount);
    inc(FStartOffset, ByteCount);
  end;

  {show progress}
  if Assigned(FOnProgress) then begin
    inc(FBytesUsed, aCount);
    if ((FBytesUsed and $FFF) = 0) then begin
      Percent := Round((100.0 * FBytesUsed) / FStreamSize);
      FOnProgress(Percent);
    end;
  end;

  {check to see if we have advanced into the slide zone}
  if (FStart >= FSlidePoint) then
    iwSlide;
end;
{--------}
procedure TAbDfInputWindow.AdvanceByOne;
var
  Percent   : integer;
begin
  {advance the current pointer}
  inc(FCurrent);

  {if we've seen at least FWinSize bytes...}
  if FAdvanceStart then begin

    {advance the start of the sliding window}
    inc(FStart, 1);
    inc(FStartOffset, 1);

    {check to see if we have advanced into the slide zone}
    if FMustSlide and (FStart >= FSlidePoint) then
      iwSlide;
  end

  {otherwise check to see if we've seen FWinSize bytes}
  else if ((FCurrent - FStart) = FWinSize) then
    FAdvanceStart := true;

  {show progress}
  if Assigned(FOnProgress) then begin
    inc(FBytesUsed, 1);
    if ((FBytesUsed and $FFF) = 0) then begin
      Percent := Round((100.0 * FBytesUsed) / FStreamSize);
      FOnProgress(Percent);
    end;
  end;
end;
{--------}
function TAbDfInputWindow.FindLongestMatch(aAmpleLength : integer;
                                       var aMatch       : TAbDfMatch;
                                     const aPrevMatch   : TAbDfMatch)
                                                        : boolean;
{Note: this routine implements a greedy algorithm and is by far the
       time sink for compression. There are two versions, one written
       in Pascal for understanding, one in assembler for speed.
       Activate one and only one of the following compiler defines.}
{$IFDEF CPU386}
  {$DEFINE UseGreedyAsm}
{$ELSE}
  {$DEFINE UseGreedyPascal}
{$ENDIF}

{Check to see that all is correct}
{$IFDEF UseGreedyAsm}
  {$IFDEF UseGreedyPascal}
    !! Compile Error: only one of the greedy compiler defines can be used
  {$ENDIF}
{$ELSE}
  {$IFNDEF UseGreedyPascal}
    !! Compile Error: one of the greedy compiler defines must be used
  {$ENDIF}
{$ENDIF}
type
  PLongint = ^longint;
  PWord    = ^word;
var
  MaxLen     : longint;
  MaxDist    : longint;
  MaxMatch   : integer;
  ChainLen   : integer;
  PrevStrPos : PAnsiChar;
  CurPos     : PAnsiChar;
  {$IFDEF UseGreedyAsm}
  CurWord    : word;
  MaxWord    : word;
  {$ENDIF}
  {$IFDEF UseGreedyPascal}
  Len        : longint;
  MatchStr   : PAnsiChar;
  CurrentCh  : PAnsiChar;
  CurCh      : AnsiChar;
  MaxCh      : AnsiChar;
  {$ENDIF}
begin
  {calculate the hash index for the current position; using the
   Rabin-Karp algorithm this is equal to the previous index less the
   effect of the character just lost plus the effect of the character
   just gained}
  CurPos := FCurrent;
  FHashIndex :=
     ((FHashIndex shl c_HashShift) xor longint(CurPos[2])) and
     c_HashMask;

  {get the head of the hash chain: this is the position in the sliding
   window of the previous 3-character string with this hash value}
  PrevStrPos := FHashHeads^[FHashIndex];

  {set the head of the hash chain equal to our current position}
  FHashHeads^[FHashIndex] := CurPos;

  {update the chain itself: set the entry for this position equal to
   the previous string position}
  FHashChains^[longint(CurPos) and FWinMask] := PrevStrPos;

  {calculate the maximum match we could do at this position}
  MaxMatch := (FLookAheadEnd - CurPos);
  if (MaxMatch > FMaxMatchLen) then
    MaxMatch := FMaxMatchLen;
  if (aAmpleLength > MaxMatch) then
    aAmpleLength := MaxMatch;

  {calculate the current match length}
  if (aPrevMatch.maLen = 0) then
    MaxLen := 2
  else begin
    if (MaxMatch < aPrevMatch.maLen) then begin
      Result := false;
      aMatch.maLen := 0;
      aMatch.maLit := CurPos^;
      Exit;
    end;
    MaxLen := aPrevMatch.maLen;
  end;

  {get the bytes at the current position and at the end of the maximum
   match we have to better}
  {$IFDEF UseGreedyAsm}
  CurWord := PWord(CurPos)^;
  MaxWord := PWord(CurPos + pred(MaxLen))^;
  {$ENDIF}
  {$IFDEF UseGreedyPascal}
  CurCh := CurPos^;
  MaxCh := (CurPos + pred(MaxLen))^;
  {$ENDIF}

  {set the chain length to search based on the current maximum match
   (basically: if we've already satisfied the ample length
   requirement, don't search as far)}
  if (MaxLen >= aAmpleLength) then
    ChainLen := FChainLen div 4
  else
    ChainLen := FChainLen;

  {get ready for the loop}
  {$IFDEF DefeatWarnings}
  MaxDist := 0;
  {$ENDIF}

  {$IFDEF UseGreedyAsm} { slip into assembler for speed...}
  asm
    push ebx                 { save those registers we should}
    push esi
    push edi

    mov ebx, Self            { ebx will store the Self pointer}
    mov edi, PrevStrPos      { edi => previous string}
    mov esi, CurPos          { esi => current string}

  @@TestThisPosition:
                             { check previous string is in range}
    or edi, edi
    je @@Exit
    cmp edi, [ebx].TAbDfInputWindow.FStart
    jb @@Exit
    cmp edi, CurPos
    jae @@Exit

    mov ax, [edi]            { check previous string starts with same}
    cmp CurWord, ax          {   two bytes as current}
    jne @@GetNextPosition    { ..nope, they don't match}

    mov edx, edi             { check previous string ends with same}
    add edi, MaxLen          {   two bytes as current (by "ends" we}
    dec edi                  {   mean the last two bytes at the}
    mov ax, [edi]            {   current match length)}
    cmp MaxWord, ax
    mov edi, edx
    jne @@GetNextPosition    { ..nope, they don't match}

    push edi                 { compare the previous string with the}
    push esi                 {   current string}
    mov eax, MaxMatch
    add edi, 2               { (we've already checked that the first}
    sub eax, 2               { two characters are the same)}
    add esi, 2
    mov ecx, eax

  @@CmpQuads:
    cmp ecx, 4
    jb @@CmpSingles

    mov edx, [esi]
    cmp edx, [edi]
    jne @@CmpSingles

    add esi, 4
    add edi, 4
    sub ecx, 4
    jnz @@CmpQuads

    jmp @@MatchCheck

  @@CmpSingles:
    or ecx, ecx
    jb @@MatchCheck

    mov dl, [esi]
    cmp dl, [edi]
    jne @@MatchCheck

    inc esi
    inc edi
    dec ecx
    jnz @@CmpSingles

  @@MatchCheck:
    sub eax, ecx
    add eax, 2
    pop esi
    pop edi

    cmp eax, MaxLen          { have we found a longer match?}
    jbe @@GetNextPosition    { ..no}
    mov MaxLen, eax          { ..yes, so save it}

    mov eax, esi             { calculate the dist for this new match}
    sub eax, edi
    mov MaxDist, eax

    cmp eax, aAmpleLength    { if this match is ample enough, exit}
    jae @@Exit

    mov eax, esi             { calculate the two bytes at the end of}
    add eax, MaxLen          {   this new match}
    dec eax
    mov ax, [eax]
    mov MaxWord, ax

  @@GetNextPosition:
    mov eax, ChainLen        { we've visited one more link on the}
    dec eax                  {   chain, if that's the last one we}
    je @@Exit                {   should visit, exit}
    mov ChainLen, eax

                             { advance along the chain}
    mov edx, [ebx].TAbDfInputWindow.FHashChains
    mov eax, [ebx].TAbDfInputWindow.FWinMask
    and edi, eax
    shl edi, 2
    mov edi, [edx+edi]
    jmp @@TestThisPosition

  @@Exit:
    pop edi
    pop esi
    pop ebx
  end;
  {$ENDIF}

  {$IFDEF UseGreedyPascal}
  {for all possible hash nodes in the chain...}
  while (FStart <= PrevStrPos) and (PrevStrPos < CurPos) do begin

    {if the initial and maximal characters match...}
    if (PrevStrPos[0] = CurCh) and
       (PrevStrPos[pred(MaxLen)] = MaxCh) then begin

      {compare more characters}
      Len := 1;
      CurrentCh := CurPos + 1;
      MatchStr := PrevStrPos + 1;

      {compare away, but don't go above the maximum length}
      while (Len < MaxMatch) and (MatchStr^ = CurrentCh^) do begin
        inc(CurrentCh);
        inc(MatchStr);
        inc(Len);
      end;

      {have we reached another maximum for the length?}
      if (Len > MaxLen) then begin
        MaxLen := Len;
        {calculate the distance}
        MaxDist := CurPos - PrevStrPos;
        MaxCh := CurPos[pred(MaxLen)];

        {is the new best length ample enough?}
        if MaxLen >= aAmpleLength then
          Break;
      end;
    end;

    {have we reached the end of this chain?}
    dec(ChainLen);
    if (ChainLen = 0) then
      Break;

    {otherwise move onto the next position}
    PrevStrPos := FHashChains^[longint(PrevStrPos) and FWinMask];
  end;
  {$ENDIF}

  {based on the results of our investigation, return the match values}
  if (MaxLen < 3) or (MaxLen <= aPrevMatch.maLen) then begin
    Result := false;
    aMatch.maLen := 0;
    aMatch.maLit := CurPos^;
  end
  else begin
    Result := true;
    aMatch.maLen := MaxLen;
    aMatch.maDist := MaxDist;
    aMatch.maLit := CurPos^; { just in case...}
  end;
end;
{--------}
function TAbDfInputWindow.GetNextChar : AnsiChar;
begin
  Result := FCurrent^;
  inc(FCurrent);
end;
{--------}
function TAbDfInputWindow.GetNextKeyLength : integer;
begin
  Result := FLookAheadEnd - FCurrent;
  if (Result > 3) then
    Result := 3;
end;
{--------}
function TAbDfInputWindow.iwGetChecksum : longint;
begin
  {the CRC32 checksum algorithm requires a post-conditioning step
   after being calculated (the result is NOTted), whereas Adler32 does
   not}
  if FUseCRC32 then
    Result := not FChecksum
  else
    Result := FChecksum;
end;
{--------}
procedure TAbDfInputWindow.iwReadFromStream;
var
  BytesRead   : longint;
  BytesToRead : longint;
begin
  {read some more data into the look ahead zone}
  BytesToRead := FBufferEnd - FLookAheadEnd;
  BytesRead := FStream.Read(FLookAheadEnd^, BytesToRead);

  {if nothing was read, we reached the end of the stream; hence
   there's no more need to slide the window since we have all the
   data}
  if (BytesRead = 0) then
    FMustSlide := false

  {otherwise something was actually read...}
  else begin
    {update the checksum}
    if FUseCRC32 then
      AbUpdateCRCBuffer(FChecksum, FLookAheadEnd^, BytesRead)
    else
      AbUpdateAdlerBuffer(FChecksum, FLookAheadEnd^, BytesRead);

    {reposition the pointer for the end of the lookahead area}
    inc(FLookAheadEnd, BytesRead);
  end;
end;
{--------}
procedure TAbDfInputWindow.iwSetCapacity(aValue : longint);
var
  ActualSize : integer;
begin
  {calculate the actual size; this will be the value passed in, plus
   the correct look ahead size, plus 16KB}
  ActualSize := aValue + (16 * 1024);
  if FUseDeflate64 then begin
    inc(ActualSize, dfc_MaxMatchLen64);
    FMaxMatchLen := dfc_MaxMatchLen64;
  end
  else begin
    inc(ActualSize, dfc_MaxMatchLen);
    FMaxMatchLen := dfc_MaxMatchLen;
  end;

  {get the new buffer}
  GetMem(FBuffer, ActualSize);

  {set the other buffer pointers}
  FStart := FBuffer;
  FCurrent := FBuffer;
  FLookAheadEnd := FBuffer;
  FBufferEnd := FBuffer + ActualSize;
  FSlidePoint := FBuffer + (16 * 1024);
end;
{--------}
procedure TAbDfInputWindow.iwSlide;
type
  PLongint = ^longint;
var
  i : integer;
  ByteCount : integer;
  Buffer    : longint;
  ListItem  : PLongint;
begin
  {move current valid data back to the start of the buffer}
  ByteCount := FLookAheadEnd - FStart;
  Move(FStart^, FBuffer^, ByteCount);

  {reset the various pointers}
  ByteCount := FStart - FBuffer;
  FStart := FBuffer;
  dec(FCurrent, ByteCount);
  dec(FLookAheadEnd, ByteCount);

  {patch up the hash table: the head pointers}
  Buffer := longint(FBuffer);
  ListItem := PLongint(@FHashHeads^[0]);
  for i := 0 to pred(c_HashCount) do begin
    dec(ListItem^, ByteCount);
    if (ListItem^ < Buffer) then
      ListItem^ := 0;
    inc(PAnsiChar(ListItem), sizeof(pointer));
  end;

  {..the chain pointers}
  ListItem  := PLongint(@FHashChains^[0]);
  for i := 0 to pred(FWinSize) do begin
    dec(ListItem^, ByteCount);
    if (ListItem^ < Buffer) then
      ListItem^ := 0;
    inc(PAnsiChar(ListItem), sizeof(pointer));
  end;

  {now read some more data from the stream}
  iwReadFromStream;
end;
{--------}
function TAbDfInputWindow.Position : longint;
begin
  Result := (FCurrent - FStart) + FStartOffset;
end;
{--------}
procedure TAbDfInputWindow.ReadBuffer(var aBuffer; aCount  : longint;
                                                   aOffset : Int64);
var
  CurPos : Int64;
begin
  CurPos := FStream.Seek(0, soCurrent);
  FStream.Seek(aOffSet, soBeginning);
  FStream.ReadBuffer(aBuffer, aCount);
  FStream.Seek(CurPos, soBeginning);
end;
{====================================================================}

end.