记事本中用到的字符识别（UTF-8 / Unicode）

技术2022-05-11 73

下面的代码是MS记事本中用的字符识别代码可以有效的识别UTF8或者是UNICODE 第一步是判断存储的文件头，如果没有文件头，则判断字符范围 /* IsTextUTF8 * * UTF-8 is the encoding of Unicode based on Internet Society RFC2279 * * Basicly: * 0000 0000-0000 007F - 0xxxxxxx (ascii converts to 1 octet!) * 0000 0080-0000 07FF - 110xxxxx 10xxxxxx ( 2 octet format) * 0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx (3 octet format) * (this keeps going for 32 bit unicode) * * * Return value: TRUE, if the text is in UTF-8 format. * FALSE, if the text is not in UTF-8 format. * We will also return FALSE is it is only 7-bit ascii, so the right code page * will be used. * * Actually for 7 bit ascii, it doesn't matter which code page we use, but * notepad will remember that it is utf-8 and "save" or "save as" will store * the file with a UTF-8 BOM. Not cool. */ INT IsTextUTF8( LPSTR lpstrInputStream, INT iLen ) { INT i; DWORD cOctets; // octets to go in this UTF-8 encoded character UCHAR chr; BOOL bAllAscii= TRUE; cOctets= 0; for( i=0; i < iLen; i++ ) { chr= *(lpstrInputStream+i); if( (chr&0x80) != 0 ) bAllAscii= FALSE; if( cOctets == 0 ) { // // 7 bit ascii after 7 bit ascii is just fine. Handle start of encoding case. // if( chr >= 0x80 ) { // // count of the leading 1 bits is the number of characters encoded // do { chr <<= 1; cOctets++; } while( (chr&0x80) != 0 ); cOctets--; // count includes this character if( cOctets == 0 ) return FALSE; // must start with 11xxxxxx } } else { // non-leading bytes must start as 10xxxxxx if( (chr&0xC0) != 0x80 ) { return FALSE; } cOctets--; // processed another octet in encoding } } // // End of text. Check for consistency. // if( cOctets > 0 ) { // anything left over at the end is an error return FALSE; } if( bAllAscii ) { // Not utf-8 if all ascii. Forces caller to use code pages for conversion return FALSE; } return TRUE; } /* IsInputTextUnicode * Verify if the input stream is in Unicode format. * * Return value: TRUE, if the text is in Unicode format. 
* * 29 June 1998 */ INT IsInputTextUnicode (LPSTR lpstrInputStream, INT iLen) { INT iResult= ~0; // turn on IS_TEXT_UNICODE_DBCS_LEADBYTE BOOL bUnicode; // We would like to check the possibility // of IS_TEXT_UNICODE_DBCS_LEADBYTE. // bUnicode= IsTextUnicode( lpstrInputStream, iLen, &iResult); if (bUnicode && ((iResult & IS_TEXT_UNICODE_STATISTICS) != 0 ) && ((iResult & (~IS_TEXT_UNICODE_STATISTICS)) == 0 ) ) { CPINFO cpiInfo; CHAR* pch= (CHAR*)lpstrInputStream; INT cb; // // If the result depends only upon statistics, check // to see if there is a possibility of DBCS. // Only do this check if the ansi code page is DBCS // GetCPInfo( CP_ACP, &cpiInfo); if( cpiInfo.MaxCharSize > 1 ) { for( cb=0; cb<iLen; cb++ ) { if( IsDBCSLeadByte(*pch++) ) { return FALSE; } } } } return bUnicode; } #define UNICODE_FFFF 0xFFFF #define REVERSE_BYTE_ORDER_MARK 0xFFFE #define BYTE_ORDER_MARK 0xFEFF lpBuf= MapViewOfFile( hMap, FILE_MAP_READ, 0,0,len); lpBufAfterBOM= (LPSTR) lpBuf; if( typeFlag == FT_UNKNOWN ) { switch(*lpBuf) { case BYTE_ORDER_MARK: bUnicode= TRUE; ftOpenedAs= FT_UNICODE; // don't count the BOM. nChars= len / sizeof(TCHAR) -1; break; case REVERSE_BYTE_ORDER_MARK: bUnicode= TRUE; ftOpenedAs= FT_UNICODEBE; // don't count the BOM. nChars= len / sizeof(TCHAR) -1; break; // UTF bom has 3 bytes; if it doesn't have UTF BOM just fall through .. case BOM_UTF8_HALF: if (len > 2 && ((BYTE) *(((LPSTR)lpBuf)+2) == BOM_UTF8_2HALF) ) { bUTF8= TRUE; cpTemp= CP_UTF8; ftOpenedAs= FT_UTF8; // Ignore the first three bytes. lpBufAfterBOM= (LPSTR)lpBuf + 3; len -= 3; break; } default: // Is the file unicode without BOM ? if ((bUnicode= IsInputTextUnicode((LPSTR) lpBuf, len))) { ftOpenedAs= FT_UNICODE; nChars= len / sizeof(TCHAR); } else { // Is the file UTF-8 even though it doesn't have UTF-8 BOM. if ((bUTF8= IsTextUTF8((LPSTR) lpBuf, len))) { ftOpenedAs= FT_UTF8; cpTemp= CP_UTF8; } // well, not it must be an ansi file! else { ftOpenedAs= FT_ANSI; cpTemp= CP_ACP; } } break; } }

专利

最新回复(0)