(* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1 * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is TurboPower Abbrevia * * The Initial Developer of the Original Code is * Craig Peterson * * Portions created by the Initial Developer are Copyright (C) 2011 * the Initial Developer. All Rights Reserved. * * Contributor(s): * * ***** END LICENSE BLOCK ***** *) {*********************************************************} {* ABBREVIA: AbCharset.pas *} {*********************************************************} {* ABBREVIA: Types and routines for working with various *} {* character encodings. *} {*********************************************************} unit AbCharset; {$I AbDefine.inc} interface {$IFDEF MSWINDOWS} uses Windows; {$ENDIF} { Unicode backwards compatibility types } {$IF NOT DECLARED(RawByteString)} type RawByteString = AnsiString; {$IFEND} {$IF NOT DECLARED(UnicodeString)} type UnicodeString = WideString; {$IFEND} type TAbCharSet = (csASCII, csANSI, csUTF8); function AbDetectCharSet(const aValue: RawByteString): TAbCharSet; function AbIsOEM(const aValue: RawByteString): Boolean; function AbRawBytesToString(const aValue: RawByteString): string; function AbStringToUnixBytes(const aValue: string): RawByteString; function AbSysCharSetIsUTF8: Boolean; {$IFDEF MSWINDOWS} function AbTryEncode(const aValue: UnicodeString; aCodePage: UINT; aAllowBestFit: Boolean; out aResult: AnsiString): Boolean; {$ENDIF} { Unicode backwards compatibility functions } {$IFNDEF UNICODE} function UTF8ToString(const S: RawByteString): string; {$ENDIF} implementation uses {$IFDEF LibcAPI} Libc, {$ENDIF} SysUtils; function AbDetectCharSet(const aValue: RawByteString): TAbCharSet; var i, TrailCnt: Integer; begin Result := csASCII; TrailCnt := 0; for i := 1 to Length(aValue) do begin if Byte(aValue[i]) >= $80 then Result := csANSI; if TrailCnt > 0 then if Byte(aValue[i]) in [$80..$BF] then Dec(TrailCnt) else Exit else if Byte(aValue[i]) in [$80..$BF] then Exit else case Byte(aValue[i]) of $C0..$C1, $F5..$FF: Exit; $C2..$DF: TrailCnt := 1; $E0..$EF: TrailCnt := 2; $F0..$F4: TrailCnt := 3; end; end; if (TrailCnt = 0) and (Result = csANSI) then Result := csUTF8; end; { -------------------------------------------------------------------------- } function AbIsOEM(const aValue: RawByteString): Boolean; // Detect whether a string of bytes is likely to be the system's ANSI or OEM codepage {$IFDEF MSWINDOWS} const // Byte values of alpha-numeric characters in OEM and ANSI codepages. // Excludes NBSP, ordinal indicators, exponents, the florin symbol, and, for // ANSI codepages matched to certain OEM ones, the micro character. // // US (OEM 437, ANSI 1252) Oem437AnsiChars = [138, 140, 142, 154, 156, 158, 159, 181, 192..214, 216..246, 248..255]; Oem437OemChars = [128..154, 160..165, 224..235, 237, 238]; // Arabic (OEM 720, ANSI 1256) Oem720AnsiChars = [129, 138, 140..144, 152, 154, 156, 159, 170, 181, 192..214, 216..239, 244, 249, 251, 252, 255]; Oem720OemChars = [130, 131, 133, 135..140, 147, 149..155, 157..173, 224..239]; // Greek (OEM 737, ANSI 1253) Oem737AnsiChars = [162, 181, 184..186, 188, 190..209, 211..254]; Oem737OemChars = [128..175, 224..240, 244, 245]; // Baltic Rim (OEM 775, ANSI 1257) Oem775AnsiChars = [168, 170, 175, 184, 186, 191..214, 216..246, 248..254]; Oem775OemChars = [128..149, 151..155, 157, 160..165, 173, 181..184, 189, 190, 198, 199, 207..216, 224..238]; // Western European (OEM 850, ANSI 1252) Oem850AnsiChars = [138, 140, 142, 154, 156, 158, 159, 192..214, 216..246, 248..255]; Oem850OemChars = [128..155, 157, 160..165, 181..183, 198, 199, 208..216, 222, 224..237]; // Central & Eastern European (OEM 852, ANSI 1250) Oem852AnsiChars = [138, 140..143, 154, 156..159, 163, 165, 170, 175, 179, 185, 186, 188, 190..214, 216..246, 248..254]; Oem852OemChars = [128..157, 159..169, 171..173, 181..184, 189, 190, 198, 199, 208..216, 221, 222, 224..238, 251..253]; // Cyrillic (OEM 855, ANSI 1251) Oem855AnsiChars = [128, 129, 131, 138, 140..144, 154, 156..159, 161..163, 165, 168, 170, 175, 178..180, 184, 186, 188..255]; Oem855OemChars = [128..173, 181..184, 189, 190, 198, 199, 208..216, 221, 222, 224..238, 241..252]; // Turkish (OEM 857, ANSI 1254) Oem857AnsiChars = [138, 140, 154, 156, 159, 192..214, 216..246, 248..255]; Oem857OemChars = [128..155, 157..167, 181..183, 198, 199, 210..212, 214..216, 222, 224..230, 233..237]; // Hebrew (OEM 862, ANSI 1255) Oem862AnsiChars = [181, 212..214, 224..250]; Oem862OemChars = [128..154, 160..165, 224..235, 237, 238]; // Cyrillic CIS (OEM 866, ANSI 1251) Oem866AnsiChars = [128, 129, 131, 138, 140..144, 154, 156..159, 161..163, 165, 168, 170, 175, 178..181, 184, 186, 188..255]; Oem866OemChars = [128..175, 224..247]; var AnsiChars, OemChars: set of Byte; IsANSI: Boolean; i: Integer; begin case GetOEMCP of 437: begin AnsiChars := Oem437AnsiChars; OemChars := Oem437OemChars; end; 720: begin AnsiChars := Oem720AnsiChars; OemChars := Oem720OemChars; end; 737: begin AnsiChars := Oem737AnsiChars; OemChars := Oem737OemChars; end; 775: begin AnsiChars := Oem775AnsiChars; OemChars := Oem775OemChars; end; 850: begin AnsiChars := Oem850AnsiChars; OemChars := Oem850OemChars; end; 852: begin AnsiChars := Oem852AnsiChars; OemChars := Oem852OemChars; end; 855: begin AnsiChars := Oem855AnsiChars; OemChars := Oem855OemChars; end; 857: begin AnsiChars := Oem857AnsiChars; OemChars := Oem857OemChars; end; 862: begin AnsiChars := Oem862AnsiChars; OemChars := Oem862OemChars; end; 866: begin AnsiChars := Oem866AnsiChars; OemChars := Oem866OemChars; end; else begin Result := False; Exit; end; end; IsANSI := True; Result := True; for i := 0 to Length(aValue) do if Ord(aValue[i]) >= $80 then begin if IsANSI then IsANSI := Ord(aValue[i]) in AnsiChars; if Result then Result := Ord(aValue[i]) in OemChars; if not IsANSI and not Result then Break end; if IsANSI then Result := False; end; {$ELSE !MSWINDOWS} begin Result := False; end; {$ENDIF !MSWINDOWS} { -------------------------------------------------------------------------- } function AbSysCharSetIsUTF8: Boolean; begin {$IFDEF DARWIN} Result := True; {$ENDIF} {$IFDEF MSWINDOWS} Result := False; {$ENDIF} {$IFDEF LINUX} Result := StrComp(nl_langinfo(_NL_CTYPE_CODESET_NAME), 'UTF-8') = 0; {$ENDIF} end; { -------------------------------------------------------------------------- } function AbRawBytesToString(const aValue: RawByteString): string; // Detect encoding of raw bytes and convert to a string begin case AbDetectCharSet(aValue) of csASCII: Result := string(aValue); csANSI: begin {$IFDEF MSWINDOWS} if AbIsOEM(aValue) then begin SetLength(Result, Length(aValue)); OemToCharBuff(PAnsiChar(aValue), PChar(Result), Length(Result)); end else {$ENDIF} Result := string(aValue); end; csUTF8: Result := UTF8ToString(aValue); end; end; { -------------------------------------------------------------------------- } function AbStringToUnixBytes(const aValue: string): RawByteString; // Convert from a string to an appropriate encoding for Unix archive types (tar/gz) // Based on testing the system encoding should be used on Linux, and UTF-8 // everywhere else. Windows apps don't agree on whether to use ANSI, OEM, or UTF-8. begin // Delphi XE2+ Posix platforms only support the UTF-8 locale {$IF DEFINED(LINUX) AND (DEFINED(FPC) OR DEFINED(KYLIX))} Result := AnsiString(aValue); {$ELSE} Result := UTF8Encode(aValue); {$IFEND} end; { -------------------------------------------------------------------------- } {$IFDEF MSWINDOWS} function AbTryEncode(const aValue: UnicodeString; aCodePage: UINT; aAllowBestFit: Boolean; out aResult: AnsiString): Boolean; // Try to encode the given Unicode string as the requested codepage const WC_NO_BEST_FIT_CHARS = $00000400; Flags: array[Boolean] of DWORD = (WC_NO_BEST_FIT_CHARS, 0); var UsedDefault: BOOL; begin if not aAllowBestFit and not CheckWin32Version(4, 1) then Result := False else begin SetLength(aResult, WideCharToMultiByte(aCodePage, Flags[aAllowBestFit], PWideChar(aValue), Length(aValue), nil, 0, nil, @UsedDefault)); SetLength(aResult, WideCharToMultiByte(aCodePage, Flags[aAllowBestFit], PWideChar(aValue), Length(aValue), PAnsiChar(aResult), Length(aResult), nil, @UsedDefault)); Result := not UsedDefault; end; end; {$ENDIF MSWINDOWS} { == Unicode backwards compatibility functions ============================= } {$IFNDEF UNICODE} function UTF8ToString(const S: RawByteString): string; begin Result := UTf8ToAnsi(S); end; {$ENDIF} end.