lasarus_compotents/viewer/unicodeutils.pas

519 lines
12 KiB
ObjectPascal

{
Most of this code is based on similar functions from Lazarus LCLProc.
}
unit UnicodeUtils;
{$mode objfpc}{$H+}
interface
uses
Classes, SysUtils;
{en
Retrieves length in bytes of the next UTF-8 character.
@param(P
Pointer to the UTF-8 characters.)
@param(aMaxBytes
States how many bytes from P can be read.)
@param(InvalidCharLen
If an invalid UTF-8 character was found then InvalidCharLen has
the number of bytes this character spans. If the character was valid
InvalidCharLen is zero.)
}
function SafeUTF8NextCharLen(P: PByte; aMaxBytes: Integer; out InvalidCharLen: Integer): Integer;
{en
Retrieves length in bytes of the previous UTF-8 character.
It does not read from P, but rather from memory locations before P.
@param(P
Pointer to the UTF-8 characters.)
@param(aMaxBytes
States how many bytes from P *backwards* can be read.
So, to safely read 3 bytes backwards ([p-1], [p-2], [p-3])
this parameter should be at least 3.)
@param(InvalidCharLen
If an invalid UTF-8 character was found then InvalidCharLen has
the number of bytes this character spans. If the character was valid
InvalidCharLen is zero.)
}
function SafeUTF8PrevCharLen(P: PByte; aMaxBytes: Integer; out InvalidCharLen: Integer): Integer;
function SafeUTF8NextCharStart(UTF8Str: PByte; Len: PtrInt): PByte;
function SafeUTF8PrevCharEnd(UTF8Str: PByte; Len: PtrInt): PByte;
{en
Returns UTF-16 character length, which is either 1 or 2.
@param(utf16char
Any UTF-16 char or one of the surrogate pairs.)
}
function UTF16CharLen(utf16char: Word): Integer;
{en
Converts an UTF-16 surrogate pair into a unicode character.
}
function utf16PairToUnicode(u1, u2: Word): Cardinal;
function Utf16LEToUtf8(const s: string): string; // UTF16-LE 2 or 4 byte little endian
function Utf16BEToUtf8(const s: string): string; // UTF16-BE 2 or 4 byte big endian
function Utf32LEToUtf8(const s: string): string; // UTF32-LE 4 byte little endian
function Utf32BEToUtf8(const s: string): string; // UTF32-BE 4 byte big endian
function Utf8ToUtf16LE(const s: string): string; // UTF16-LE 2 or 4 byte little endian
function Utf8ToUtf16BE(const s: string): string; // UTF16-BE 2 or 4 byte big endian
{en
Replaces invalid UTF-8 characters with '?'.
}
function Utf8ReplaceBroken(const s: UTF8String): UTF8String;
implementation
uses
LCLProc, LazUTF8;
const
maxUTF8Len = 7; // really is 4, but this includes any invalid characters up to length 7
function SafeUTF8NextCharLen(P: PByte; aMaxBytes: Integer; out InvalidCharLen: Integer): Integer;
var
BytesLen: Integer;
i: Integer;
begin
if (p=nil) or (aMaxBytes = 0) then
begin
InvalidCharLen := 0;
Result := 0;
end
else if p^<%10000000 then begin
// regular single byte character
InvalidCharLen := 0;
Result := 1;
end
else if p^<%11000000 then begin
// invalid single byte character
InvalidCharLen := 1;
Result := 1;
end
else
begin
// Read length of UTF-8 character in bytes.
if ((p^ and %11100000) = %11000000) then BytesLen := 2
else if ((p^ and %11110000) = %11100000) then BytesLen := 3
else if ((p^ and %11111000) = %11110000) then BytesLen := 4
else if ((p^ and %11111100) = %11111000) then BytesLen := 5
else if ((p^ and %11111110) = %11111100) then BytesLen := 6
else if ((p^ and %11111111) = %11111110) then BytesLen := 7
else
begin
InvalidCharLen := 1;
exit(1);
end;
// Check if the next bytes are from the middle of a character.
for i := 1 to BytesLen - 1 do
begin
if (aMaxBytes < i) or ((p[i] and %11000000) <> %10000000) then
begin
InvalidCharLen := i;
exit(1);
end;
end;
InvalidCharLen := 0;
Result := BytesLen;
end;
end;
function SafeUTF8PrevCharLen(P: PByte; aMaxBytes: Integer; out InvalidCharLen: Integer): Integer;
var
BytesLen: Integer;
signature: Byte;
begin
if (p=nil) or (aMaxBytes = 0) then
begin
InvalidCharLen := 0;
Result := 0;
end
else if p[-1]<%10000000 then begin
// regular single byte character
InvalidCharLen := 0;
Result := 1;
end
else
begin
for BytesLen := 1 to maxUTF8Len do
begin
if (aMaxBytes < BytesLen) then
begin
InvalidCharLen := aMaxBytes;
exit(1);
end;
// Move past all the bytes in the middle of a character.
if (p[-BytesLen] and %11000000) <> %10000000 then
break;
if BytesLen = maxUTF8Len then
begin
InvalidCharLen := BytesLen;
exit(1);
end;
end;
if p[-BytesLen]<%11000000 then
begin
// invalid first byte of a character
InvalidCharLen := BytesLen;
Result := 1;
end
else
begin
signature := Byte($FF shl (7 - BytesLen));
if (p[-BytesLen] and signature) = Byte(signature shl 1) then
begin
// Correct first byte of a character.
InvalidCharLen := 0;
Result := BytesLen;
end
else
begin
// Invalid first byte of a character, or p is in the middle of a character.
InvalidCharLen := BytesLen;
Result := 1;
end;
end;
end;
end;
function SafeUTF8NextCharStart(UTF8Str: PByte; Len: PtrInt): PByte;
var
CharLen: LongInt;
InvalidCharLen: Integer;
begin
Result:=UTF8Str;
if Result<>nil then begin
while (Len>0) do begin
CharLen := SafeUTF8NextCharLen(Result, Len, InvalidCharLen);
if InvalidCharLen > 0 then
begin
dec(Len,InvalidCharLen);
inc(Result,InvalidCharLen);
end
else if CharLen = 0 then
exit(nil)
else
exit(Result);
end;
Result:=nil;
end;
end;
function SafeUTF8PrevCharEnd(UTF8Str: PByte; Len: PtrInt): PByte;
var
CharLen: LongInt;
InvalidCharLen: Integer;
begin
Result:=UTF8Str;
if Result<>nil then begin
while (Len>0) do begin
CharLen := SafeUTF8PrevCharLen(Result, Len, InvalidCharLen);
if InvalidCharLen > 0 then
begin
dec(Len,InvalidCharLen);
dec(Result,InvalidCharLen);
end
else if CharLen = 0 then
exit(nil)
else
exit(Result); // Result is the character beginning
end;
Result:=nil;
end;
end;
function UTF16CharLen(utf16char: Word): Integer; inline;
begin
if (utf16char < $D800) or (utf16char > $DFFF) then
Result := 1
else
Result := 2;
end;
function utf16PairToUnicode(u1, u2: Word): Cardinal;
begin
if (u1 >= $D800) and (u1 <= $DBFF) then
begin
if (u2 >= $DC00) and (u2 <= $DFFF) then
Result := (Cardinal(u1 - $D800) shl 10) + Cardinal(u2 - $DC00) + $10000
else
Result := 0;
end
else
Result := u1;
end;
function Utf16LEToUtf8(const s: string): string;
var
len: Integer;
Src, Limit: PWord;
Dest: PAnsiChar;
u: Cardinal;
begin
if Length(s) < 2 then begin
Result:='';
exit;
end;
Src:=PWord(Pointer(s));
Limit := PWord(Pointer(Src) + Length(s));
SetLength(Result, length(s) * 2);
Dest:=PAnsiChar(Result);
while Src + 1 <= Limit do begin
len := UTF16CharLen(Src^);
if len = 1 then
u := LEtoN(Src^)
else
begin
if Src + 2 <= Limit then
u := utf16PairToUnicode(LEtoN(Src[0]), LEtoN(Src[1]))
else
break;
end;
inc(Src, len);
if u<128 then begin
Dest^:=chr(u);
inc(Dest);
end else begin
inc(Dest,UnicodeToUTF8SkipErrors(u,Dest));
end;
end;
len:=PtrUInt(Dest)-PtrUInt(Result);
if len>length(Result) then
RaiseGDBException('');
SetLength(Result,len);
end;
function Utf16BEToUtf8(const s: string): string;
var
len: Integer;
Src, Limit: PWord;
Dest: PAnsiChar;
u: Cardinal;
begin
if Length(s) < 2 then begin
Result:='';
exit;
end;
Src:=PWord(Pointer(s));
Limit := PWord(Pointer(Src) + Length(s));
SetLength(Result, length(s) * 2);
Dest:=PAnsiChar(Result);
while Src + 1 <= Limit do begin
len := UTF16CharLen(Src^);
if len = 1 then
u := BEtoN(Src^)
else
begin
if Src + 2 <= Limit then
u := utf16PairToUnicode(BEtoN(Src[0]), BEtoN(Src[1]))
else
break;
end;
inc(Src, len);
if u<128 then begin
Dest^:=chr(u);
inc(Dest);
end else begin
inc(Dest,UnicodeToUTF8SkipErrors(u,Dest));
end;
end;
len:=PtrUInt(Dest)-PtrUInt(Result);
if len>length(Result) then
RaiseGDBException('');
SetLength(Result,len);
end;
function Utf32LEToUtf8(const s: string): string;
var
len: Integer;
Src: PLongWord;
Dest: PAnsiChar;
i: Integer;
c: LongWord;
begin
if Length(s) < 4 then begin
Result:='';
exit;
end;
len:=length(s) div 4;
SetLength(Result,len*4);
Src:=PLongWord(Pointer(s));
Dest:=PAnsiChar(Result);
for i:=1 to len do begin
c:=LEtoN(Src^);
inc(Src);
if c<128 then begin
Dest^:=chr(c);
inc(Dest);
end else begin
inc(Dest,UnicodeToUTF8SkipErrors(c,Dest));
end;
end;
len:=PtrUInt(Dest)-PtrUInt(Result);
if len>length(Result) then
RaiseGDBException('');
SetLength(Result,len);
end;
function Utf32BEToUtf8(const s: string): string;
var
len: Integer;
Src: PLongWord;
Dest: PAnsiChar;
i: Integer;
c: LongWord;
begin
if Length(s) < 4 then begin
Result:='';
exit;
end;
len:=length(s) div 4;
SetLength(Result,len*4);
Src:=PLongWord(Pointer(s));
Dest:=PAnsiChar(Result);
for i:=1 to len do begin
c:=BEtoN(Src^);
inc(Src);
if c<128 then begin
Dest^:=chr(c);
inc(Dest);
end else begin
inc(Dest,UnicodeToUTF8SkipErrors(c,Dest));
end;
end;
len:=PtrUInt(Dest)-PtrUInt(Result);
if len>length(Result) then
RaiseGDBException('');
SetLength(Result,len);
end;
function Utf8ToUtf16LE(const s: string): string;
var
P: PWord;
I, L: SizeUInt;
begin
if Length(S) = 0 then
begin
Result := '';
Exit;
end;
// Wide chars of UTF-16 <= bytes of UTF-8 string
SetLength(Result, Length(S) * SizeOf(WideChar));
if ConvertUTF8ToUTF16(PWideChar(PAnsiChar(Result)), Length(Result) + SizeOf(WideChar),
PAnsiChar(S), Length(S), [toInvalidCharToSymbol], L) <> trNoError
then
Result := ''
else
begin
SetLength(Result, (L - 1) * SizeOf(WideChar));
// Swap endian if needed
if (NtoLE($FFFE) <> $FFFE) then
begin
P := PWord(PAnsiChar(Result));
for I := 0 to L - 1 do
begin
P[I] := SwapEndian(P[I]);
end;
end;
end;
end;
function Utf8ToUtf16BE(const s: string): string;
var
P: PWord;
I, L: SizeUInt;
begin
if Length(S) = 0 then
begin
Result := '';
Exit;
end;
// Wide chars of UTF-16 <= bytes of UTF-8 string
SetLength(Result, Length(S) * SizeOf(WideChar));
if ConvertUTF8ToUTF16(PWideChar(PAnsiChar(Result)), Length(Result) + SizeOf(WideChar),
PAnsiChar(S), Length(S), [toInvalidCharToSymbol], L) <> trNoError
then
Result := ''
else
begin
SetLength(Result, (L - 1) * SizeOf(WideChar));
// Swap endian if needed
if (NtoBE($FEFF) <> $FEFF) then
begin
P := PWord(PAnsiChar(Result));
for I := 0 to L - 1 do
begin
P[I] := SwapEndian(P[I]);
end;
end;
end;
end;
function Utf8ReplaceBroken(const s: UTF8String): UTF8String;
var
Src, Dst, LastGoodPos: PByte;
BytesLeft: Integer;
InvalidCharLen: Integer;
CharLen: Integer;
begin
if Length(s) = 0 then
Exit(s);
BytesLeft := Length(s);
SetLength(Result, BytesLeft); // at most the same length
Src := PByte(s);
Dst := PByte(Result);
LastGoodPos := Src;
while BytesLeft > 0 do
begin
CharLen := SafeUTF8NextCharLen(Src, BytesLeft, InvalidCharLen);
if InvalidCharLen > 0 then
begin
if LastGoodPos < Src then
begin
System.Move(LastGoodPos^, Dst^, Src - LastGoodPos);
Inc(Dst, Src - LastGoodPos);
end;
Inc(Src, InvalidCharLen);
Dec(BytesLeft, InvalidCharLen);
LastGoodPos := Src;
Dst^ := ord('?');
Inc(Dst);
end
else
begin
Inc(Src, CharLen);
Dec(BytesLeft, CharLen);
end;
end;
if LastGoodPos = PByte(s) then
Result := s // All characters are good.
else
begin
if LastGoodPos < Src then
begin
System.Move(LastGoodPos^, Dst^, Src - LastGoodPos);
Inc(Dst, Src - LastGoodPos);
end;
SetLength(Result, Dst - PByte(Result));
end;
end;
end.