69 lines
2.2 KiB
ObjectPascal
69 lines
2.2 KiB
ObjectPascal
|
|
//Code by Christian Ghisler (ghisler.com)
|
|
//Christian released this code as open source on the TCmd (Total Commander) forum
|
|
|
|
unit atstringproc_utf8detect;
|
|
|
|
{$mode objfpc}{$H+}
|
|
|
|
interface
|
|
|
|
//PartialAllowed must be set to true if the buffer is smaller than the file.
|
|
function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
|
|
|
|
implementation
|
|
|
|
{ Lookup table indexed by a byte value: the number of bytes that follow it
  when that byte is read as the first byte of a sequence. GetUtf8CharWidth
  adds 1 to this value to obtain the total sequence width. }
const
  bytesFromUTF8: array[AnsiChar] of byte = (
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // $00..$0F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // $10..$1F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // $20..$2F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // $30..$3F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // $40..$4F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // $50..$5F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // $60..$6F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // $70..$7F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // $80..$8F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // $90..$9F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // $A0..$AF
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // $B0..$BF
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // $C0..$CF
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // $D0..$DF
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // $E0..$EF
    3,3,3,3,3,3,3,3, 4,4,4,4,5,5,5,5   // $F0..$FF
  );
|
|
|
|
function GetUtf8CharWidth(firstchar:AnsiChar):integer;
{ Total length in bytes of the sequence introduced by firstchar:
  the bytesFromUTF8 table entry for that byte, plus one for the byte itself. }
begin
  Result := 1 + bytesFromUTF8[firstchar];
end;
|
|
|
|
function IsFirstUTF8Char(thechar:AnsiChar):boolean;
{ True for every byte that is NOT a continuation byte, i.e. whose two most
  significant bits are anything other than 10. }
const
  TopTwoBitsMask = $C0;  // 1100 0000
  ContinuationTag = $80; // 1000 0000
begin
  Result := not ((Ord(thechar) and TopTwoBitsMask) = ContinuationTag);
end;
|
|
|
|
function IsSecondaryUTF8Char(thechar:AnsiChar):boolean;
{ True when the byte is a continuation byte of a multi-byte sequence:
  its two most significant bits are 10. }
begin
  // Shifting right by 6 leaves only the top two bits; binary 10 is 2.
  Result := (Ord(thechar) shr 6) = 2;
end;
|
|
|
|
function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
{ Decides whether a zero-terminated buffer looks like UTF-8 text.

  Returns true only when ALL of the following hold:
    - the buffer contains at least one multi-byte sequence (a buffer with
      no continuation bytes, e.g. pure 7-bit text, yields false);
    - no continuation byte appears where a lead byte is expected;
    - every lead byte is followed by the number of continuation bytes that
      GetUtf8CharWidth reports for it;
    - no lead byte is one of $C0, $C1, $F5..$FF, which can never occur in
      UTF-8 (RFC 3629);
    - the buffer does not end in the middle of a sequence, unless
      PartialAllowed is true.

  buf            - zero-terminated byte buffer; scanning stops at the first #0.
                   A nil pointer yields false.
  PartialAllowed - pass true when the buffer may have been cut off in the
                   middle of a sequence (buffer smaller than the file). }
var
  p: PAnsiChar;
  utf8bytes: integer;      // continuation bytes still expected
  hadutf8bytes: boolean;   // at least one continuation position was reached
begin
  result := false;
  if buf = nil then
    exit; {Nothing to scan}
  p := buf;
  hadutf8bytes := false;
  utf8bytes := 0;
  while p[0] <> #0 do begin
    if utf8bytes > 0 then begin {Expecting secondary AnsiChar}
      hadutf8bytes := true;
      if not IsSecondaryUTF8Char(p[0]) then
        exit; {Fail!}
      dec(utf8bytes);
    end
    else if IsSecondaryUTF8Char(p[0]) then
      exit {Fail: secondary byte without a preceding lead byte}
    else if p[0] in [#$C0, #$C1, #$F5..#$FF] then
      exit {Fail: byte value that is never valid in UTF-8}
    else
      utf8bytes := GetUtf8CharWidth(p[0]) - 1;
    inc(p);
  end;
  {A sequence left incomplete at the end is tolerated only for partial buffers}
  result := hadutf8bytes and (PartialAllowed or (utf8bytes = 0));
end;
|
|
|
|
end.
|