mirror of
https://github.com/quizhizhe/LiteLoaderBDS-1.16.40.git
synced 2025-06-09 13:08:11 +00:00
300 lines
10 KiB
C
300 lines
10 KiB
C
// Copyright 2016 Google Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#ifndef UTIL_ENCODINGS_ENCODINGS_H_
|
|
#define UTIL_ENCODINGS_ENCODINGS_H_
|
|
|
|
// This interface defines the Encoding enum and various functions that
|
|
// depend only on Encoding values.
|
|
|
|
// A hash-function for Encoding, hash<Encoding>, is defined in
|
|
// i18n/encodings/public/encodings-hash.h
|
|
|
|
// On some Windows projects, UNICODE may be defined, which would prevent the
|
|
// Encoding enum below from compiling. Note that this is a quick fix that does
|
|
// not break any existing projects. The UNICODE enum may someday be changed
|
|
// to something more specific and non-colliding, but this involves careful
|
|
// testing of changes in many other projects.
|
|
#undef UNICODE
|
|
|
|
// NOTE: The Encoding enum must always start at 0. This assumption has
|
|
// been made and used.
|
|
|
|
#ifndef SWIG
|
|
|
|
#include "encodings.pb.h"
|
|
|
|
#else
|
|
|
|
// TODO: Include a SWIG workaround header file.
|
|
|
|
#endif
|
|
|
|
const int kNumEncodings = NUM_ENCODINGS;
|
|
|
|
// some of the popular encoding aliases
|
|
// TODO: Make these static const Encoding values instead of macros.
|
|
#define LATIN1 ISO_8859_1
|
|
#define LATIN2 ISO_8859_2
|
|
#define LATIN3 ISO_8859_3
|
|
#define LATIN4 ISO_8859_4
|
|
#define CYRILLIC ISO_8859_5
|
|
#define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language
|
|
#define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language
|
|
#define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language
|
|
#define LATIN5 ISO_8859_9
|
|
#define LATIN6 ISO_8859_10
|
|
#define KOREAN_HANGUL KOREAN_EUC_KR
|
|
|
|
// The default Encoding (LATIN1).
|
|
Encoding default_encoding();
|
|
|
|
|
|
|
|
// *************************************************************
|
|
// Encoding predicates
|
|
// IsValidEncoding()
|
|
// IsEncEncCompatible
|
|
// IsSupersetOfAscii7Bit
|
|
// Is8BitEncoding
|
|
// IsCJKEncoding
|
|
// IsHebrewEncoding
|
|
// IsRightToLeftEncoding
|
|
// IsLogicalRightToLeftEncoding
|
|
// IsVisualRightToLeftEncoding
|
|
// IsIso2022Encoding
|
|
// IsIso2022JpOrVariant
|
|
// IsShiftJisOrVariant
|
|
// IsJapaneseCellPhoneCarrierSpecificEncoding
|
|
// *************************************************************
|
|
|
|
// IsValidEncoding
|
|
// ===================================
|
|
//
|
|
// Function to check if the input language enum is within range.
|
|
//
|
|
|
|
bool IsValidEncoding(Encoding enc);
|
|
|
|
//
|
|
// IsEncEncCompatible
|
|
// ------------------
|
|
//
|
|
// This function is to determine whether or not converting from the
|
|
// first encoding to the second requires any changes to the underlying
|
|
// text (e.g. ASCII_7BIT is a subset of UTF8).
|
|
//
|
|
// TODO: the current implementation is likely incomplete. It would be
|
|
// good to consider the full matrix of all pairs of encodings and to fish out
|
|
// all compatible pairs.
|
|
//
|
|
bool IsEncEncCompatible(const Encoding from, const Encoding to);
|
|
|
|
// To be a superset of 7-bit Ascii means that bytes 0...127 in the given
|
|
// encoding represent the same characters as they do in ISO_8859_1.
|
|
|
|
// WARNING: This function does not currently return true for all encodings that
|
|
// are supersets of Ascii 7-bit.
|
|
bool IsSupersetOfAscii7Bit(Encoding e);
|
|
|
|
// To be an 8-bit encoding means that there are fewer than 256 symbols.
|
|
// Each byte determines a new character; there are no multi-byte sequences.
|
|
|
|
// WARNING: This function does not currently return true for all encodings that
|
|
// are 8-bit encodings.
|
|
bool Is8BitEncoding(Encoding e);
|
|
|
|
// IsCJKEncoding
|
|
// -------------
|
|
//
|
|
// This function returns true if the encoding is either Chinese
|
|
// (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
|
|
// considered a CJK encoding.
|
|
bool IsCJKEncoding(Encoding e);
|
|
|
|
// IsHebrewEncoding
|
|
// -------------
|
|
//
|
|
// This function returns true if the encoding is a Hebrew specific
|
|
// encoding (not UTF8, etc).
|
|
bool IsHebrewEncoding(Encoding e);
|
|
|
|
// IsRightToLeftEncoding
|
|
// ---------------------
|
|
//
|
|
// Returns true if the encoding is a right-to-left encoding.
|
|
//
|
|
// Note that the name of this function is somewhat misleading. There is nothing
|
|
// "right to left" about these encodings. They merely contain code points for
|
|
// characters in RTL languages such as Hebrew and Arabic. But this is also
|
|
// true for UTF-8.
|
|
//
|
|
// TODO: Get rid of this function. The only special-case we
|
|
// should need to worry about are visual encodings. Anything we
|
|
// need to do for all 'RTL' encodings we need to do for UTF-8 as well.
|
|
bool IsRightToLeftEncoding(Encoding enc);
|
|
|
|
// IsLogicalRightToLeftEncoding
|
|
// ----------------------------
|
|
//
|
|
// Returns true if the encoding is a logical right-to-left encoding.
|
|
// Logical right-to-left encodings are those that the browser renders
|
|
// right-to-left and applies the BiDi algorithm to. Therefore the characters
|
|
// appear in reading order in the file, and indexing, snippet generation etc.
|
|
// should all just work with no special processing.
|
|
//
|
|
// TODO: Get rid of this function. The only special-case we
|
|
// should need to worry about are visual encodings.
|
|
bool IsLogicalRightToLeftEncoding(Encoding enc);
|
|
|
|
// IsVisualRightToLeftEncoding
|
|
// ---------------------------
|
|
//
|
|
// Returns true if the encoding is a visual right-to-left encoding.
|
|
// Visual right-to-left encodings are those that the browser renders
|
|
// left-to-right and does not apply the BiDi algorithm to. Therefore each
|
|
// line appears in reverse order in the file, lines are manually wrapped
|
|
// by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
|
|
// the prehistoric days when browsers couldn't render right-to-left, but
|
|
// unfortunately some visual pages persist to this day. These documents require
|
|
// special processing so that we don't index or snippet them with each line
|
|
// reversed.
|
|
bool IsVisualRightToLeftEncoding(Encoding enc);
|
|
|
|
// IsIso2022Encoding
|
|
// -----------------
|
|
//
|
|
// Returns true if the encoding is a kind of ISO 2022 such as
|
|
// ISO-2022-JP.
|
|
bool IsIso2022Encoding(Encoding enc);
|
|
|
|
// IsIso2022JpOrVariant
|
|
// --------------------
|
|
//
|
|
// Returns true if the encoding is ISO-2022-JP or a variant such as
|
|
// KDDI's ISO-2022-JP.
|
|
bool IsIso2022JpOrVariant(Encoding enc);
|
|
|
|
// IsShiftJisOrVariant
|
|
// --------------------
|
|
//
|
|
// Returns true if the encoding is Shift_JIS or a variant such as
|
|
// KDDI's Shift_JIS.
|
|
bool IsShiftJisOrVariant(Encoding enc);
|
|
|
|
// IsJapanesCellPhoneCarrierSpecificEncoding
|
|
// -----------------------------------------
|
|
//
|
|
// Returns true if it's Japanese cell phone carrier specific encoding
|
|
// such as KDDI_SHIFT_JIS.
|
|
bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
|
|
|
|
|
|
|
|
// *************************************************************
|
|
// ENCODING NAMES
|
|
//
|
|
// This interface defines a standard name for each valid encoding, and
|
|
// a standard name for invalid encodings. (Some names use all upper
|
|
// case, but others use mixed case.)
|
|
//
|
|
// EncodingName() [Encoding to name]
|
|
// MimeEncodingName() [Encoding to name]
|
|
// EncodingFromName() [name to Encoding]
|
|
// EncodingNameAliasToEncoding() [name to Encoding]
|
|
// default_encoding_name()
|
|
// invalid_encoding_name()
|
|
// *************************************************************
|
|
|
|
// EncodingName
|
|
// ------------
|
|
//
|
|
// Given the encoding, returns its standard name.
|
|
// Return invalid_encoding_name() if the encoding is invalid.
|
|
//
|
|
const char* EncodingName(Encoding enc);
|
|
|
|
//
|
|
// MimeEncodingName
|
|
// ----------------
|
|
//
|
|
// Return the "preferred MIME name" of an encoding.
|
|
//
|
|
// This name is suitable for using in HTTP headers, HTML tags,
|
|
// and as the "charset" parameter of a MIME Content-Type.
|
|
const char* MimeEncodingName(Encoding enc);
|
|
|
|
|
|
// The maximum length of an encoding name
|
|
const int kMaxEncodingNameSize = 50;
|
|
|
|
// The standard name of the default encoding.
|
|
const char* default_encoding_name();
|
|
|
|
// The name used for an invalid encoding.
|
|
const char* invalid_encoding_name();
|
|
|
|
// EncodingFromName
|
|
// ----------------
|
|
//
|
|
// If enc_name matches the standard name of an Encoding, using a
|
|
// case-insensitive comparison, set *encoding to that Encoding and
|
|
// return true. Otherwise set *encoding to UNKNOWN_ENCODING and
|
|
// return false.
|
|
//
|
|
// REQUIRES: encoding must not be NULL.
|
|
//
|
|
bool EncodingFromName(const char* enc_name, Encoding *encoding);
|
|
|
|
//
|
|
// EncodingNameAliasToEncoding
|
|
// ---------------------------
|
|
//
|
|
// If enc_name matches the standard name or an alias of an Encoding,
|
|
// using a case-insensitive comparison, return that
|
|
// Encoding. Otherwise, return UNKNOWN_ENCODING.
|
|
//
|
|
// Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
|
|
// GREEK), alternate names (e.g., "cyrillic" for ISO_8859_5) and
|
|
// common variations with hyphens and underscores (e.g., "koi8-u" and
|
|
// "koi8u" for RUSSIAN_KOI8_R).
|
|
|
|
Encoding EncodingNameAliasToEncoding(const char *enc_name);
|
|
|
|
// *************************************************************
|
|
// Miscellany
|
|
// *************************************************************
|
|
|
|
// PreferredWebOutputEncoding
|
|
// --------------------------
|
|
//
|
|
// Some multi-byte encodings use byte values that coincide with the
|
|
// ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
|
|
// can misinterpret these, as indicated in an external XSS report from
|
|
// 2007-02-15. Here, we map these dangerous encodings to safer ones. We
|
|
// also use UTF8 instead of encodings that we don't support in our
|
|
// output, and we generally try to be conservative in what we send out.
|
|
// Where the client asks for single- or double-byte encodings that are
|
|
// not as common, we substitute a more common single- or double-byte
|
|
// encoding, if there is one, thereby preserving the client's intent
|
|
// to use less space than UTF-8. This also means that characters
|
|
// outside the destination set will be converted to HTML NCRs (&#NNN;)
|
|
// if requested.
|
|
Encoding PreferredWebOutputEncoding(Encoding enc);
|
|
|
|
|
|
#endif // UTIL_ENCODINGS_ENCODINGS_H_
|