lasarus_compotents/ATSynEdit/atsynedit/atstringproc_utf8detect.pas

69 lines
2.2 KiB
ObjectPascal

//Code by Christian Ghisler (ghisler.com)
//Christian gave code to open-source at TCmd forum
unit atstringproc_utf8detect;
{$mode objfpc}{$H+}
interface
//PartialAllowed must be set to true if the buffer is smaller than the file.
function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
implementation
const bytesFromUTF8:array[AnsiChar] of byte = (
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 64
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 96
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //128
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //160
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //192
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, //224
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5); //256
function GetUtf8CharWidth(firstchar:AnsiChar):integer;
begin
result:=bytesFromUTF8[firstchar]+1;
end;
function IsFirstUTF8Char(thechar:AnsiChar):boolean;
{The remaining bytes in a multi-byte sequence have 10 as their two most significant bits.}
begin
result:=(byte(thechar) and (128+64))<>128;
end;
function IsSecondaryUTF8Char(thechar:AnsiChar):boolean;
{The remaining bytes in a multi-byte sequence have 10 as their two most significant bits.}
begin
result:=(byte(thechar) and (128+64))=128;
end;
function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
{Buffer contains only valid UTF-8 characters, no secondary alone,
no primary without the correct nr of secondary}
var p:PAnsiChar;
utf8bytes:integer;
hadutf8bytes:boolean;
begin
p:=buf;
hadutf8bytes:=false;
result:=false;
utf8bytes:=0;
while p[0]<>#0 do begin
if utf8bytes>0 then begin {Expecting secondary AnsiChar}
hadutf8bytes:=true;
if not IsSecondaryUTF8Char(p[0]) then exit; {Fail!}
dec(utf8bytes);
end else if IsFirstUTF8Char(p[0]) then
utf8bytes:=GetUtf8CharWidth(p[0])-1
else if IsSecondaryUTF8Char(p[0]) then
exit; {Fail!}
inc(p);
end;
result:=hadutf8bytes and (PartialAllowed or (utf8bytes=0));
end;
end.