lasarus_compotents/bgrabitmap/dev/parseunicode/parseunicodeclasses.lpr

772 lines
26 KiB
ObjectPascal
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// SPDX-License-Identifier: LGPL-3.0-only (modified to allow linking)
program parseunicodeclasses;
uses Classes, sysutils, fgl, LazUTF8;
type
TIntegerList = specialize TFPGList<Integer>;
var
UnicodeData: array of record
Code: LongInt;
Name, Category: string;
CombiningClass: byte;
BidiClass, Decomposition: string;
Mirrored: boolean;
OldName: string;
end;
UnicodeCount: integer;
procedure LoadUnicodeData;
var
lines, cells: TStringList;
i: Integer;
begin
lines := TStringList.Create;
lines.LoadFromFile('UnicodeData.txt');
setlength(UnicodeData, lines.Count);
UnicodeCount := 0;
cells := TStringList.Create;
cells.Delimiter := ';';
cells.QuoteChar := '"';
cells.StrictDelimiter := true;
for i := 0 to lines.Count-1 do
begin
cells.DelimitedText := lines[i];
if cells.Count >= 11 then
with UnicodeData[UnicodeCount] do
begin
Code := StrToInt('$'+cells[0]);
Name := cells[1];
Category := cells[2];
CombiningClass:= StrToInt(cells[3]);
BidiClass := cells[4];
Decomposition:= cells[5];
Mirrored := (cells[9] = 'Y');
OldName := cells[10];
inc(UnicodeCount);
end;
end;
SetLength(UnicodeData, unicodeCount);
lines.Free;
end;
function IndexOfUnicode(u: LongInt): integer;
var
low, high, mid: Integer;
begin
low := 0;
high := UnicodeCount-1;
while low < high do
begin
mid := (low+high) div 2;
if u > UnicodeData[mid].Code then
low := mid+1
else
high := mid;
end;
if UnicodeData[low].Code = u then
result := low
else
result := -1;
end;
function GetUnicodeBidiClass(u: LongInt): string;
var
idx: Integer;
begin
idx := IndexOfUnicode(u);
if idx = -1 then
result := ''
else result := UnicodeData[idx].BidiClass;
end;
function GetUnicodeCombiningClass(u: LongInt): byte;
var
idx: Integer;
begin
idx := IndexOfUnicode(u);
if idx = -1 then
result := 0
else result := UnicodeData[idx].CombiningClass;
end;
function UnicodeCharToUTF8(u: LongInt): string;
begin
if u >= 0 then
result := UnicodeToUTF8(cardinal(u))
else
result := '';
end;
function ArrayOfCodeToCase(ACodes: array of integer; AIndent: string): string;
var
codeCount, i: Integer;
bufLines: TStringList;
buf: String;
begin
codeCount := length(ACodes);
if codeCount = 0 then exit('');
bufLines := TStringList.Create;
i := 0;
buf := AIndent+' ';
while i < codeCount do
begin
if i > 0 then buf += ', ';
if length(buf) > 95 then
begin
bufLines.Add(buf);
buf := AIndent+' ';
end;
if (i+2 < codeCount) and (ACodes[i]+1 = ACodes[i+1]) and (ACodes[i+1]+1 = ACodes[i+2]) then
begin
buf += '$'+IntToHex(ACodes[i],2) + '..';
while (i+1 < codeCount) and (ACodes[i]+1 = ACodes[i+1]) do inc(i);
buf += '$'+IntToHex(ACodes[i],2);
end else
buf += '$'+IntToHex(ACodes[i],2);
inc(i);
end;
if trim(buf) <> '' then bufLines.Add(buf);
result := '';
for i := 0 to bufLines.Count-1 do
begin
if i > 0 then result += LineEnding;
result += bufLines[i];
end;
bufLines.Free;
result += ': ';
end;
function ArrayOfCodeToCase(ACodes: TIntegerList; AIndent: string): string;
var a: array of integer;
i: Integer;
begin
setlengtH(a, ACodes.Count);
for i := 0 to high(a) do
a[i] := ACodes[i];
result := ArrayOfCodeToCase(a, AIndent);
end;
procedure GenerateUnicodeFunctions;
const Indent = ' ';
var
tIn, tOut: TextFile;
procedure ParseBidiClasses;
type
TUnicodeBidiClass = (ubcBoundaryNeutral, ubcSegmentSeparator, ubcParagraphSeparator, ubcWhiteSpace, ubcOtherNeutrals,
ubcCommonSeparator, ubcNonSpacingMark,
ubcLeftToRight, ubcEuropeanNumber, ubcEuropeanNumberSeparator, ubcEuropeanNumberTerminator,
ubcRightToLeft, ubcArabicLetter, ubcArabicNumber,
ubcUnknown,
ubcCombiningLeftToRight, //ubcLeftToRight in Mc category
ubcMirroredNeutral); //ubcOtherNeutrals with Mirrored property
procedure IncludeClasses(AClasses: TStrings; AMinCode, AMaxCode: integer);
const
MaxGapsPerClass = 20;
var
codes: array[TUnicodeBidiClass] of TIntegerList;
gaps: array[TUnicodeBidiClass] of integer;
procedure FlushCase(curBidi: TUnicodeBidiClass);
var
caseStr: string;
begin
if codes[curBidi].Count = 0 then exit;
caseStr := ArrayOfCodeToCase(codes[curBidi], Indent);
case curBidi of
ubcCommonSeparator: WriteLn(tOut,caseStr+'result := ubcCommonSeparator;');
ubcLeftToRight: WriteLn(tOut,caseStr+'result := ubcLeftToRight;');
ubcCombiningLeftToRight: WriteLn(tOut,caseStr+'result := ubcCombiningLeftToRight;');
ubcEuropeanNumber: WriteLn(tOut,caseStr+'result := ubcEuropeanNumber;');
ubcEuropeanNumberSeparator: WriteLn(tOut,caseStr+'result := ubcEuropeanNumberSeparator;');
ubcEuropeanNumberTerminator: WriteLn(tOut,caseStr+'result := ubcEuropeanNumberTerminator;');
ubcRightToLeft: WriteLn(tOut,caseStr+'result := ubcRightToLeft;');
ubcArabicLetter: WriteLn(tOut,caseStr+'result := ubcArabicLetter;');
ubcArabicNumber: WriteLn(tOut,caseStr+'result := ubcArabicNumber;');
ubcNonSpacingMark: WriteLn(tOut,caseStr+'result := ubcNonSpacingMark;');
ubcBoundaryNeutral: WriteLn(tOut,caseStr+'result := ubcBoundaryNeutral;');
ubcParagraphSeparator: WriteLn(tOut,caseStr+'result := ubcParagraphSeparator;');
ubcSegmentSeparator: WriteLn(tOut,caseStr+'result := ubcSegmentSeparator;');
ubcWhiteSpace: WriteLn(tOut,caseStr+'result := ubcWhiteSpace;');
ubcMirroredNeutral: WriteLn(tOut,caseStr+'result := ubcMirroredNeutral;');
ubcOtherNeutrals: WriteLn(tOut,caseStr+'result := ubcOtherNeutrals;');
else raise exception.Create('Unknown bidi class');
end;
codes[curBidi].Clear;
gaps[curBidi] := 0;
end;
var
newBidi: TUnicodeBidiClass;
curCode: LongInt;
i: integer;
begin
write(' ', AClasses.DelimitedText);
for newBidi := low(TUnicodeBidiClass) to high(TUnicodeBidiClass) do
begin
codes[newBidi] := TIntegerList.Create;
gaps[newBidi] := 0;
end;
for i := 0 to UnicodeCount-1 do
begin
case UnicodeData[i].BidiClass of
'CS': newBidi := ubcCommonSeparator;
'L': newBidi := ubcLeftToRight;
'EN': newBidi := ubcEuropeanNumber;
'ES': newBidi := ubcEuropeanNumberSeparator;
'ET': newBidi := ubcEuropeanNumberTerminator;
'R': newBidi := ubcRightToLeft;
'AL': newBidi := ubcArabicLetter;
'AN': newBidi := ubcArabicNumber;
'NSM': newBidi := ubcNonSpacingMark;
'BN': newBidi := ubcBoundaryNeutral;
'B': newBidi := ubcParagraphSeparator;
'S': newBidi := ubcSegmentSeparator;
'WS': newBidi := ubcWhiteSpace;
'ON': newBidi := ubcOtherNeutrals;
else continue;
end;
if (newBidi = ubcLeftToRight) and (UnicodeData[i].Category = 'Mc') then newBidi := ubcCombiningLeftToRight
else if (newBidi = ubcOtherNeutrals) and UnicodeData[i].Mirrored then newBidi := ubcMirroredNeutral;
if AClasses.IndexOf(UnicodeData[i].BidiClass)<>-1 then
begin
curCode := UnicodeData[i].Code;
if (curCode >= AMinCode) and (curCode <= AMaxCode) then
begin
if (codes[newBidi].Count > 0) and (codes[newBidi].Last+1 <> curCode) then
inc(gaps[newBidi]);
codes[newBidi].Add(curCode);
if gaps[newBidi] > MaxGapsPerClass then
FlushCase(newBidi);
end;
end;
end;
for newBidi := low(TUnicodeBidiClass) to high(TUnicodeBidiClass) do
begin
FlushCase(newBidi);
codes[newBidi].Free;
end;
end;
var c: TStringList;
procedure Include(AMinCode,AMaxCode: integer);
begin
write('Classes from ',IntToHex(AMinCode,2),' to ',IntToHex(AMaxCode,2),':');
Writeln(tOut,Indent+'case u of');
c.CommaText := 'BN';
IncludeClasses(c, AMinCode,AMaxCode);
c.CommaText := 'S';
IncludeClasses(c, AMinCode,AMaxCode);
c.CommaText := 'B';
IncludeClasses(c, AMinCode,AMaxCode);
c.CommaText := 'WS';
IncludeClasses(c, AMinCode,AMaxCode);
c.CommaText := 'L,CL,R,AL';
IncludeClasses(c, AMinCode,AMaxCode);
c.CommaText := 'EN';
IncludeClasses(c, AMinCode,AMaxCode);
c.CommaText := 'ES';
IncludeClasses(c, AMinCode,AMaxCode);
c.CommaText := 'ET';
IncludeClasses(c, AMinCode,AMaxCode);
c.CommaText := 'AN';
IncludeClasses(c, AMinCode,AMaxCode);
c.CommaText := 'CS,NSM';
IncludeClasses(c, AMinCode,AMaxCode);
c.CommaText := 'ON,MN';
IncludeClasses(c, AMinCode,AMaxCode);
writeln(tout,Indent+'else result := ubcUnknown;');
writeln(tout,Indent+'end;');
writeln;
end;
begin
Writeln(tOut,'function GetUnicodeBidiClassEx(u: LongWord): TUnicodeBidiClass;');
Writeln(tOut,'begin');
c := TStringList.Create;
writeln(tOut,' case u of');
writeln(tOut,' $00000..$07FFF:');
writeln(tOut,' case u of');
writeln(tOut,' $00000..$003FF:');
Include($00000, $003FF);
writeln(tOut,' $00400..$007FF:');
Include($00400, $007FF);
writeln(tOut,' $00800..$00BFF:');
Include($00800, $00BFF);
writeln(tOut,' $00C00..$00FFF:');
Include($00C00, $00FFF);
writeln(tOut,' $01000..$017FF:');
Include($01000, $017FF);
writeln(tOut,' $01800..$01FFF:');
Include($01800, $01FFF);
writeln(tOut,' $02000..$02FFF:');
Include($02000, $02FFF);
writeln(tOut,' else');
Include($03000, $07FFF);
writeln(tOut,' end;');
writeln(tOut,' $08000..$0BFFF:');
Include($08000, $0BFFF);
writeln(tOut,' $0C000..$0FFFF:');
Include($0C000, $0FFFF);
writeln(tOut,' else');
writeln(tOut,' case u of');
writeln(tOut,' $10000..$107FF:');
Include($10000, $107FF);
writeln(tOut,' $10800..$10FFF:');
Include($10800, $10FFF);
writeln(tOut,' $11000..$117FF:');
Include($11000, $117FF);
writeln(tOut,' $11800..$17FFF:');
Include($11800, $17FFF);
writeln(tOut,' $18000..$1DFFF:');
Include($18000, $1DFFF);
writeln(tOut,' $1E000..$FFFFF:');
Include($1E000, $FFFFF);
writeln(tOut,' else result := ubcUnknown;');
writeln(tOut,' end');
writeln(tOut,' end');
c.Free;
writeln(tout,'end;');
writeln(tout);
end;
procedure ParseBidiBrackets;
var elem: TStringList;
line: string;
tIn: TextFile;
begin
Writeln(tOut,'function GetUnicodeBracketInfo(u: LongWord): TUnicodeBracketInfo;');
Writeln(tOut,' procedure Bracket(AOpening,AClosing: LongWord);');
Writeln(tOut,' begin');
Writeln(tOut,' result.IsBracket := true;');
Writeln(tOut,' result.OpeningBracket := AOpening;');
Writeln(tOut,' result.ClosingBracket := AClosing;');
Writeln(tOut,' end;');
Writeln(tOut,'begin');
Writeln(tOut,' case u of');
writeln('Parsing bracket data...');
assignfile(tIn, 'BidiBrackets.txt');
reset(tin);
elem := TStringList.Create;
elem.Delimiter := ';';
elem.StrictDelimiter:= true;
while not eof(tin) do
begin
readln(tin, line);
elem.DelimitedText:= line;
if elem.Count >= 3 then
begin
if copy(trim(elem[2]),1,1) = 'o' then
writeln(tOut,' $'+trim(elem[0])+', $'+trim(elem[1])+': Bracket($'+trim(elem[0])+', $'+trim(elem[1])+');');
end;
end;
elem.Free;
closefile(tin);
writeln(tout,' else');
writeln(tout,' begin');
writeln(tout,' result.IsBracket := false;');
writeln(tout,' result.OpeningBracket := 0;');
writeln(tout,' result.ClosingBracket := 0;');
writeln(tout,' end;');
Writeln(tOut,' end;');
Writeln(tOut,'end;');
Writeln(tOut);
end;
procedure ParseArabicLigature;
var
line: string;
cells: TStringList;
chars: TStringList;
u: LongInt;
j: Integer;
procedure AddJoiningType(joinType: string; joinTypeEnum: string; AIndent: string; AMinIndex,AMaxIndex: integer);
var
i,nb: Integer;
charsList: array of integer;
begin
nb := 0;
for i := AMinIndex to AMaxIndex do
if chars.ValueFromIndex[i]=joinType then inc(nb);
if nb = 0 then exit;
setlength(charsList, nb);
nb := 0;
for i := AMinIndex to AMaxIndex do
if chars.ValueFromIndex[i]=joinType then
begin
charsList[nb] := StrToInt('$'+chars.Names[i]);
inc(nb);
end;
writeln(tOut,ArrayOfCodeToCase(charsList, AIndent)+'result := '+joinTypeEnum+';');
end;
procedure AddJoiningTypeRange(AMinIndex,AMaxIndex: integer; AIndent: string; AForceCase: boolean = false);
const MaxGaps = 45;
var
mid, i, gaps, halfGaps: Integer;
begin
gaps := 0;
for i := AMinIndex+1 to AMaxIndex do
if (StrToInt('$'+chars.Names[i])-StrToInt('$'+chars.Names[i-1]) > 1) or
(chars.ValueFromIndex[i] <> chars.ValueFromIndex[i-1]) then inc(gaps);
if (gaps > MaxGaps) and not AForceCase then
begin
halfGaps := 0;
mid := (AMinIndex+AMaxIndex) div 2;
for i := AMinIndex+1 to AMaxIndex do
if (StrToInt('$'+chars.Names[i])-StrToInt('$'+chars.Names[i-1]) > 1) or
(chars.ValueFromIndex[i] <> chars.ValueFromIndex[i-1]) then
begin
inc(halfGaps);
if halfGaps >= gaps shr 1 then
begin
mid := i;
break;
end;
end;
if gaps <= MaxGaps*2.5 then
begin
writeln(tOut,AIndent, 'if u <= $', chars.Names[mid],' then');
AddJoiningTypeRange(AMinIndex, mid, AIndent+' ', true);
writeln(tOut,AIndent, 'else');
AddJoiningTypeRange(mid+1, AMaxIndex, AIndent+' ', true);
end else
begin
writeln(tOut,AIndent, 'if u <= $', chars.Names[mid],' then begin');
AddJoiningTypeRange(AMinIndex, mid, AIndent+' ');
writeln(tOut,AIndent, 'end else begin');
AddJoiningTypeRange(mid+1, AMaxIndex, AIndent+' ');
writeln(tOut,AIndent, 'end');
end;
end else
begin
writeln(tOut,AIndent, 'case u of');
AddJoiningType('T', 'ujtTransparent', AIndent, AMinIndex, AMaxIndex);
AddJoiningType('R', 'ujtRightJoining', AIndent, AMinIndex, AMaxIndex);
AddJoiningType('L', 'ujtLeftJoining', AIndent, AMinIndex, AMaxIndex);
AddJoiningType('D', 'ujtDualJoining', AIndent, AMinIndex, AMaxIndex);
AddJoiningType('C', 'ujtJoinCausing', AIndent, AMinIndex, AMaxIndex);
writeln(tOut,AIndent, 'end');
end;
end;
begin
writeln('Parsing arabic ligature data...');
chars := TStringList.Create;
for j := 0 to UnicodeCount-1 do
begin
if (UnicodeData[j].Category = 'Mn') or (UnicodeData[j].Category = 'Me')
or (UnicodeData[j].Category = 'Cf') then
chars.Values[IntToHex(UnicodeData[j].Code,6)] := 'T';
end;
assignfile(tIn, 'ArabicShaping.txt');
reset(tIn);
cells := TStringList.Create;
cells.Delimiter := ';';
cells.QuoteChar := '"';
cells.StrictDelimiter := true;
while not eof(tIn) do
begin
readln(tIn, line);
if (line = '') or (line[1]='#') then continue;
cells.DelimitedText:= line;
if cells.Count >= 4 then
begin
u := StrToInt('$'+cells[0]);
if trim(cells[2]) = 'U' then
begin
j := chars.IndexOfName(IntToHex(u,6));
if j <> -1 then
chars.Delete(j);
end
else
chars.Values[IntToHex(u,6)] := trim(cells[2]);
end;
end;
closefile(tIn);
cells.Free;
chars.Sort;
Writeln(tOut,'function GetUnicodeJoiningType(u: LongWord): TUnicodeJoiningType;');
Writeln(tOut,'begin');
writeln(tOut,' result := ujtNonJoining;');
AddJoiningTypeRange(0, chars.Count-1, ' ');
chars.Free;
Writeln(tOut,'end;');
Writeln(tOut);
end;
procedure ParseCombiningClasses;
const
CombineLeftOnly = '093F,094E,' + {DEVANAGARI}
'09BF,09C7,09C8,' + {BENGALI}
'0A3F,' + {GURMUKHI}
'0ABF,' + {GUJARATI}
'0B47,0B48,0B4B,0B4C,' + {ORIYA}
'0BC6,0BC7,0BC8,' + {TAMIL}
'0D46,0D47,0D48,' + {MALAYALAM}
'0DD9,0DDA,0DDB,0DDC,0DDD,0DDE,' + {SINHALA}
'1031,103C,1084,' + {MYANMAR}
'17BE,17C1,17C2,17C3,' + {KHMER}
'1A19,' + {BUGINESE}
'1B3E,1B3F,' + {BALINESE}
'302E,302F,' + {HANGUL}
'A9BA,A9BB,A9BF,' + {JAVANESE}
'AA2F,AA30,AA34,'; {CHAM}
CombineLeftAndRight = '09CB,09CC,' + {BENGALI}
'0BCA,0BCB,0BCC,' + {TAMIL}
'0D4A,0D4B,0D4C,' + {MALAYALAM}
'17BF,17C0,17C4,17C5,' + {KHMER}
'1B3D,1B40,1B41,'; {BALINESE}
var
i: Integer;
infos: TStringList;
u: LongInt;
c: byte;
s: String;
procedure FlushLine;
begin
writeln(tOut, s);
s := ' ';
end;
begin
infos := TStringList.Create;
for i := 0 to UnicodeCount-1 do
begin
u := UnicodeData[i].Code;
if (UnicodeData[i].BidiClass = 'NSM') or
(UnicodeData[i].Category = 'Mc') then
begin
c := UnicodeData[i].CombiningClass;
if (c = 0) and (UnicodeData[i].Category = 'Mc') then
begin
if pos(IntToHex(u,4)+',', CombineLeftOnly) <> 0 then c := 208
else if pos(IntToHex(u,4)+',', CombineLeftAndRight) <> 0 then c := 0
else c := 210;
end;
infos.Add('(u:$'+IntToHex(u,2)+'; c:'+IntToStr(c)+')');
end;
end;
writeln(tOut,'type');
writeln(tOut,' TUnicodeCombiningInfo = record');
writeln(tOut,' u: LongWord;');
writeln(tOut,' c: Byte;');
writeln(tOut,' end;');
writeln(tOut,'const');
writeln(tOut,' UnicodeCombiningInfos: array[0..',infos.count-1,'] of TUnicodeCombiningInfo =');
s := ' (';
for i := 0 to infos.Count-1 do
begin
if length(s) + length(infos[i]) + 2 > 80 then FlushLine;
AppendStr(s, ' ' + infos[i]);
if i < infos.Count-1 then AppendStr(s, ',');
end;
if s <> ' ' then FlushLine;
writeln(tOut,' );');
writeln(tOut);
infos.Free;
end;
begin
AssignFile(tOut, 'generatedunicode.inc');
Rewrite(tOut);
writeln(tOut,'{ This file is generated by dev/parseunicode/parseunicodeclasses program }');
Writeln(tOut);
ParseBidiClasses;
ParseBidiBrackets;
ParseArabicLigature;
ParseCombiningClasses;
CloseFile(tOut);
end;
function ListCompareBinary(List: TStringList; Index1, Index2: Integer): Integer;
begin
result := CompareStr(List[Index1], List[Index2]);
end;
procedure ParseUTF8Decomposition;
type TDecompositionKind = string;
const dMultichar = 'arNone';
dInitial = 'arInitial';
dMedial = 'arMedial';
dFinal = 'arFinal';
dIsolated = 'arIsolated';
const UTF8_ARABIC_ALEPH = 'ا';
UTF8_ARABIC_ALEPH_HAMZA_BELOW = 'إ';
UTF8_ARABIC_ALEPH_HAMZA_ABOVE = 'أ';
UTF8_ARABIC_ALEPH_MADDA_ABOVE = 'آ';
UTF8_ARABIC_LAM = 'ل';
var tOut: TextFile;
decomposed, kind, decomposedUTF8, s: string;
decomposedFirstChar: LongInt;
mergedU,nextU, fallbackU: LongInt;
posClose, posSpace: SizeInt;
hasNSM, isLa: Boolean;
correspList: TStringList;
kerningFallback: TStringList;
i, decomposedLen, j: Integer;
typedKind: TDecompositionKind;
hasMarkLeft, hasMarkRight: boolean;
function RemoveUptoTab(AText: string): string;
var
idxTab: SizeInt;
begin
idxTab := pos(#9, AText);
result := copy(AText, idxTab+1, length(AText)-idxTab);
end;
begin
writeln('Parsing decomposition data...');
correspList := TStringList.Create;
kerningFallback := TStringList.Create;
for j := 0 to UnicodeCount-1 do
begin
mergedU := UnicodeData[j].Code;
if UnicodeData[j].BidiClass = 'NSM' then continue;
decomposed := UnicodeData[j].Decomposition;
if decomposed = '' then continue;
typedKind := dMultichar;
if decomposed[1] = '<' then
begin
posClose := pos('>', decomposed);
if posClose = 0 then continue;
kind := copy(decomposed,1,posClose);
delete(decomposed, 1, posClose);
if kind = '<initial>' then typedKind := dInitial else
if kind = '<medial>' then typedKind := dMedial else
if kind = '<final>' then typedKind := dFinal else
if kind = '<isolated>' then typedKind := dIsolated else
if (kind = '<compat>') and (mergedU >= $FB00) and (mergedU <= $FB04) then
typedKind := dMultichar
else
continue;
decomposed := trim(decomposed);
end;
decomposedUTF8 := '';
decomposedLen := 0;
decomposedFirstChar:= 0;
hasMarkLeft := false;
hasMarkRight := false;
hasNSM := false;
while decomposed <> '' do
begin
posSpace := pos(' ',decomposed);
if posSpace = 0 then posSpace := length(decomposed)+1;
nextU := strToInt('$'+copy(decomposed,1,posSpace-1));
if GetUnicodeBidiClass(nextU) = 'NSM' then hasNSM := true;
case GetUnicodeCombiningClass(nextU) of
200,208,212,218,224,228: hasMarkLeft := true;
204,210,216,222,226,232: hasMarkRight := true;
end;
if decomposedLen = 0 then decomposedFirstChar:= nextU;
AppendStr(decomposedUTF8, UnicodeCharToUTF8(nextU));
delete(decomposed, 1, posSpace);
inc(decomposedLen);
end;
isLa := (decomposedUTF8 = UTF8_ARABIC_LAM+UTF8_ARABIC_ALEPH) or
(decomposedUTF8 = UTF8_ARABIC_LAM+UTF8_ARABIC_ALEPH_HAMZA_BELOW) or
(decomposedUTF8 = UTF8_ARABIC_LAM+UTF8_ARABIC_ALEPH_HAMZA_ABOVE) or
(decomposedUTF8 = UTF8_ARABIC_LAM+UTF8_ARABIC_ALEPH_MADDA_ABOVE);
if ((typedKind = dMultichar) and (decomposedLen > 1)
and (hasNSM or (copy(decomposedUTF8,1,1) = 'f'))) or
((typedKind <> dMultichar) and ((decomposedLen = 1) or isLa)) then
correspList.Add(decomposedUTF8+#9+'('+
'de:''' + decomposedUTF8 + '''; ' +
're:''' + UnicodeCharToUTF8(mergedU) + '''; ' +
'join:' + typedKind +
')');
if (typedKind = dMultichar) and (decomposedUTF8 <> '') and not hasMarkLeft and not hasMarkRight and
((decomposedUTF8[1] in ['A'..'Z']) or (copy(decomposedUTF8,1,length('Æ')) = 'Æ') or
(copy(decomposedUTF8,1,length('Ç')) = 'Ç') or
(copy(decomposedUTF8,1,length('Г')) = 'Г') or
(copy(decomposedUTF8,1,length('Ѵ')) = 'Ѵ') or
(copy(decomposedUTF8,1,length('Ω')) = 'Ω') or
(copy(decomposedUTF8,1,length('Ө')) = 'Ө')) then
begin
fallbackU := decomposedFirstChar;
if fallbackU <> 32 then
kerningFallback.Add('(u:$' + inttohex(mergedU,2)+'; fb:$'+ inttohex(fallbackU,2)+')');
end;
end;
AssignFile(tOut, 'generatedutf8.inc');
Rewrite(tOut);
writeln(tOut,'{ This file is generated by dev/parseunicode/parseunicodeclasses program }');
writeln(tOut, 'type');
writeln(tOut, ' TArabicJoin = (arNone, arInitial, arMedial, arFinal, arIsolated);');
writeln(tOut, ' TUTF8Decomposition = record');
writeln(tOut, ' de, re: string; //decomposed, recomposed UTF8');
writeln(tOut, ' join: TArabicJoin;');
writeln(tOut, ' end;');
writeln(tOut, 'const');
writeln(tOut, ' UTF8Decomposition : array[0..', correspList.Count-1, '] of TUTF8Decomposition = (');
correspList.CustomSort(@ListCompareBinary);
for i := 0 to correspList.Count-1 do
if i <> correspList.Count-1 then
writeln(tOut, ' ', RemoveUptoTab(correspList[i]), ',')
else
writeln(tOut, ' ', RemoveUptoTab(correspList[i]));
correspList.Free;
writeln(tOut, ' );');
writeln(tout);
CloseFile(tOut);
AssignFile(tOut, 'generatedkerningfallback.inc');
Rewrite(tOut);
writeln(tOut,'{ This file is generated by dev/parseunicode/parseunicodeclasses program }');
writeln(tOut, 'type');
writeln(tOut, ' TKerningFallbackInfo = record');
writeln(tOut, ' u: integer; //composed charcode');
writeln(tOut, ' fb: integer; //fallback code');
writeln(tOut, ' end;');
writeln(tOut, 'const');
writeln(tOut, ' KerningFallbackInfo : array[0..', kerningFallback.Count-1, '] of TKerningFallbackInfo = (');
s := '';
for i := 0 to kerningFallback.Count-1 do
begin
if i <> kerningFallback.Count-1 then
AppendStr(s, kerningFallback[i] + ', ')
else
AppendStr(s, kerningFallback[i]);
if length(s) > 70 then
begin
writeln(tOut, ' ', s);
s := '';
end;
end;
if s <> '' then
writeln(tOut, ' ', s);
writeln(tOut, ' );');
writeln(tout);
kerningFallback.Free;
CloseFile(tOut);
end;
begin
LoadUnicodeData;
GenerateUnicodeFunctions;
ParseUTF8Decomposition;
writeln('Done.');
end.