344 lines
9.6 KiB
ObjectPascal

(* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is TurboPower Abbrevia
*
* The Initial Developer of the Original Code is
* Craig Peterson <capeterson@users.sourceforge.net>
*
* Portions created by the Initial Developer are Copyright (C) 2011
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* ***** END LICENSE BLOCK ***** *)
{*********************************************************}
{* ABBREVIA: AbCharset.pas *}
{*********************************************************}
{* ABBREVIA: Types and routines for working with various *}
{* character encodings. *}
{*********************************************************}
unit AbCharset;
{$I AbDefine.inc}
interface
{$IFDEF MSWINDOWS}
uses
Windows;
{$ENDIF}
{ Unicode backwards compatibility types }
{$IF NOT DECLARED(RawByteString)}
type
RawByteString = AnsiString;
{$IFEND}
{$IF NOT DECLARED(UnicodeString)}
type
UnicodeString = WideString;
{$IFEND}
type
TAbCharSet = (csASCII, csANSI, csUTF8);
function AbDetectCharSet(const aValue: RawByteString): TAbCharSet;
function AbIsOEM(const aValue: RawByteString): Boolean;
function AbRawBytesToString(const aValue: RawByteString): string;
function AbStringToUnixBytes(const aValue: string): RawByteString;
function AbSysCharSetIsUTF8: Boolean;
{$IFDEF MSWINDOWS}
function AbTryEncode(const aValue: UnicodeString; aCodePage: UINT;
aAllowBestFit: Boolean; out aResult: AnsiString): Boolean;
{$ENDIF}
{ Unicode backwards compatibility functions }
{$IFNDEF UNICODE}
function UTF8ToString(const S: RawByteString): string;
{$ENDIF}
implementation
uses
{$IFDEF LibcAPI}
Libc,
{$ENDIF}
SysUtils;
function AbDetectCharSet(const aValue: RawByteString): TAbCharSet;
var
i, TrailCnt: Integer;
begin
Result := csASCII;
TrailCnt := 0;
for i := 1 to Length(aValue) do begin
if Byte(aValue[i]) >= $80 then
Result := csANSI;
if TrailCnt > 0 then
if Byte(aValue[i]) in [$80..$BF] then
Dec(TrailCnt)
else Exit
else if Byte(aValue[i]) in [$80..$BF] then
Exit
else
case Byte(aValue[i]) of
$C0..$C1, $F5..$FF: Exit;
$C2..$DF: TrailCnt := 1;
$E0..$EF: TrailCnt := 2;
$F0..$F4: TrailCnt := 3;
end;
end;
if (TrailCnt = 0) and (Result = csANSI) then
Result := csUTF8;
end;
{ -------------------------------------------------------------------------- }
function AbIsOEM(const aValue: RawByteString): Boolean;
// Detect whether a string of bytes is likely to be the system's ANSI or OEM codepage
{$IFDEF MSWINDOWS}
const
// Byte values of alpha-numeric characters in OEM and ANSI codepages.
// Excludes NBSP, ordinal indicators, exponents, the florin symbol, and, for
// ANSI codepages matched to certain OEM ones, the micro character.
//
// US (OEM 437, ANSI 1252)
Oem437AnsiChars =
[138, 140, 142, 154, 156, 158, 159, 181, 192..214, 216..246, 248..255];
Oem437OemChars =
[128..154, 160..165, 224..235, 237, 238];
// Arabic (OEM 720, ANSI 1256)
Oem720AnsiChars =
[129, 138, 140..144, 152, 154, 156, 159, 170, 181, 192..214, 216..239, 244,
249, 251, 252, 255];
Oem720OemChars =
[130, 131, 133, 135..140, 147, 149..155, 157..173, 224..239];
// Greek (OEM 737, ANSI 1253)
Oem737AnsiChars =
[162, 181, 184..186, 188, 190..209, 211..254];
Oem737OemChars =
[128..175, 224..240, 244, 245];
// Baltic Rim (OEM 775, ANSI 1257)
Oem775AnsiChars =
[168, 170, 175, 184, 186, 191..214, 216..246, 248..254];
Oem775OemChars =
[128..149, 151..155, 157, 160..165, 173, 181..184, 189, 190, 198, 199,
207..216, 224..238];
// Western European (OEM 850, ANSI 1252)
Oem850AnsiChars =
[138, 140, 142, 154, 156, 158, 159, 192..214, 216..246, 248..255];
Oem850OemChars =
[128..155, 157, 160..165, 181..183, 198, 199, 208..216, 222, 224..237];
// Central & Eastern European (OEM 852, ANSI 1250)
Oem852AnsiChars =
[138, 140..143, 154, 156..159, 163, 165, 170, 175, 179, 185, 186, 188,
190..214, 216..246, 248..254];
Oem852OemChars =
[128..157, 159..169, 171..173, 181..184, 189, 190, 198, 199, 208..216, 221,
222, 224..238, 251..253];
// Cyrillic (OEM 855, ANSI 1251)
Oem855AnsiChars =
[128, 129, 131, 138, 140..144, 154, 156..159, 161..163, 165, 168, 170, 175,
178..180, 184, 186, 188..255];
Oem855OemChars =
[128..173, 181..184, 189, 190, 198, 199, 208..216, 221, 222, 224..238,
241..252];
// Turkish (OEM 857, ANSI 1254)
Oem857AnsiChars =
[138, 140, 154, 156, 159, 192..214, 216..246, 248..255];
Oem857OemChars =
[128..155, 157..167, 181..183, 198, 199, 210..212, 214..216, 222, 224..230,
233..237];
// Hebrew (OEM 862, ANSI 1255)
Oem862AnsiChars =
[181, 212..214, 224..250];
Oem862OemChars =
[128..154, 160..165, 224..235, 237, 238];
// Cyrillic CIS (OEM 866, ANSI 1251)
Oem866AnsiChars =
[128, 129, 131, 138, 140..144, 154, 156..159, 161..163, 165, 168, 170, 175,
178..181, 184, 186, 188..255];
Oem866OemChars =
[128..175, 224..247];
var
AnsiChars, OemChars: set of Byte;
IsANSI: Boolean;
i: Integer;
begin
case GetOEMCP of
437:
begin
AnsiChars := Oem437AnsiChars;
OemChars := Oem437OemChars;
end;
720:
begin
AnsiChars := Oem720AnsiChars;
OemChars := Oem720OemChars;
end;
737:
begin
AnsiChars := Oem737AnsiChars;
OemChars := Oem737OemChars;
end;
775:
begin
AnsiChars := Oem775AnsiChars;
OemChars := Oem775OemChars;
end;
850:
begin
AnsiChars := Oem850AnsiChars;
OemChars := Oem850OemChars;
end;
852:
begin
AnsiChars := Oem852AnsiChars;
OemChars := Oem852OemChars;
end;
855:
begin
AnsiChars := Oem855AnsiChars;
OemChars := Oem855OemChars;
end;
857:
begin
AnsiChars := Oem857AnsiChars;
OemChars := Oem857OemChars;
end;
862:
begin
AnsiChars := Oem862AnsiChars;
OemChars := Oem862OemChars;
end;
866:
begin
AnsiChars := Oem866AnsiChars;
OemChars := Oem866OemChars;
end;
else
begin
Result := False;
Exit;
end;
end;
IsANSI := True;
Result := True;
for i := 0 to Length(aValue) do
if Ord(aValue[i]) >= $80 then
begin
if IsANSI then
IsANSI := Ord(aValue[i]) in AnsiChars;
if Result then
Result := Ord(aValue[i]) in OemChars;
if not IsANSI and not Result then
Break
end;
if IsANSI then
Result := False;
end;
{$ELSE !MSWINDOWS}
begin
Result := False;
end;
{$ENDIF !MSWINDOWS}
{ -------------------------------------------------------------------------- }
function AbSysCharSetIsUTF8: Boolean;
begin
{$IFDEF DARWIN}
Result := True;
{$ENDIF}
{$IFDEF MSWINDOWS}
Result := False;
{$ENDIF}
{$IFDEF LINUX}
Result := StrComp(nl_langinfo(_NL_CTYPE_CODESET_NAME), 'UTF-8') = 0;
{$ENDIF}
end;
{ -------------------------------------------------------------------------- }
function AbRawBytesToString(const aValue: RawByteString): string;
// Detect encoding of raw bytes and convert to a string
begin
case AbDetectCharSet(aValue) of
csASCII:
Result := string(aValue);
csANSI: begin
{$IFDEF MSWINDOWS}
if AbIsOEM(aValue) then begin
SetLength(Result, Length(aValue));
OemToCharBuff(PAnsiChar(aValue), PChar(Result), Length(Result));
end
else
{$ENDIF}
Result := string(aValue);
end;
csUTF8:
Result := UTF8ToString(aValue);
end;
end;
{ -------------------------------------------------------------------------- }
function AbStringToUnixBytes(const aValue: string): RawByteString;
// Convert from a string to an appropriate encoding for Unix archive types (tar/gz)
// Based on testing the system encoding should be used on Linux, and UTF-8
// everywhere else. Windows apps don't agree on whether to use ANSI, OEM, or UTF-8.
begin
// Delphi XE2+ Posix platforms only support the UTF-8 locale
{$IF DEFINED(LINUX) AND (DEFINED(FPC) OR DEFINED(KYLIX))}
Result := AnsiString(aValue);
{$ELSE}
Result := UTF8Encode(aValue);
{$IFEND}
end;
{ -------------------------------------------------------------------------- }
{$IFDEF MSWINDOWS}
function AbTryEncode(const aValue: UnicodeString; aCodePage: UINT;
aAllowBestFit: Boolean; out aResult: AnsiString): Boolean;
// Try to encode the given Unicode string as the requested codepage
const
WC_NO_BEST_FIT_CHARS = $00000400;
Flags: array[Boolean] of DWORD = (WC_NO_BEST_FIT_CHARS, 0);
var
UsedDefault: BOOL;
begin
if not aAllowBestFit and not CheckWin32Version(4, 1) then
Result := False
else begin
SetLength(aResult, WideCharToMultiByte(aCodePage, Flags[aAllowBestFit],
PWideChar(aValue), Length(aValue), nil, 0, nil, @UsedDefault));
SetLength(aResult, WideCharToMultiByte(aCodePage, Flags[aAllowBestFit],
PWideChar(aValue), Length(aValue), PAnsiChar(aResult),
Length(aResult), nil, @UsedDefault));
Result := not UsedDefault;
end;
end;
{$ENDIF MSWINDOWS}
{ == Unicode backwards compatibility functions ============================= }
{$IFNDEF UNICODE}
function UTF8ToString(const S: RawByteString): string;
begin
Result := UTf8ToAnsi(S);
end;
{$ENDIF}
end.