UTF-8 to UTF-16
Sun, 20 Jan 2008 07:58:42 +0000 - Author: Peter O.
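The C function below converts a null-terminated UTF-8 string to UTF-16, validating the input as it goes, so the result can be passed to Windows API functions and similar interfaces. The example also shows how to retrieve individual characters from a UTF-16 string. The code calls strlen, malloc, and free, so include <string.h> and <stdlib.h>. If you don't want to include windows.h, use this header: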
/*
 * Minimal stand-ins for the <windows.h> names used by the code below.
 * Each preprocessor directive must sit on its own line to be valid C.
 */
typedef unsigned short WCHAR; /* one UTF-16 code unit */
typedef signed long LONG;
typedef unsigned char BYTE;
typedef unsigned short WORD;
typedef unsigned long DWORD;
typedef int BOOL;

#ifndef NULL
#define NULL ((void*)0)
#endif

#ifndef TRUE
#define TRUE 1
#endif

#ifndef FALSE
#define FALSE 0
#endif
The code follows. I release it to the public domain.
/*
 * One row of the UTF-8 decoding table: describes a class of lead bytes
 * together with the constraints on the bytes that follow it.
 */
typedef struct {
    BYTE byte1f;    /* lowest lead byte covered by this row (inclusive) */
    BYTE byte1t;    /* highest lead byte covered by this row (inclusive) */
    BYTE byte2f;    /* lowest value allowed for the second byte (inclusive) */
    BYTE byte2t;    /* highest value allowed for the second byte (inclusive) */
    BYTE valueMask; /* mask selecting the payload bits of the lead byte */
    BYTE trail;     /* number of bytes that follow the lead byte */
} UTF8TBLROW;
/*
 * Table of well-formed UTF-8 sequences, keyed by lead byte.
 * Columns: lead-byte range, second-byte range, lead-byte payload mask,
 * number of trailing bytes. Lead bytes not listed here (0x80..0xC1 and
 * 0xF5..0xFF) match no row. The all-zero row at the end is the terminator:
 * its zero valueMask is what stops a table scan.
 */
static const UTF8TBLROW utf8tbl[] = {
    /* lead from/to  2nd from/to  mask  trail */
    { 0x00, 0x7F,    0x00, 0x00,  0x7F, 0 }, /* U+0000..U+007F */
    { 0xC2, 0xDF,    0x80, 0xBF,  0x1F, 1 }, /* U+0080..U+07FF */
    { 0xE0, 0xE0,    0xA0, 0xBF,  0x0F, 2 }, /* U+0800..U+0FFF */
    { 0xE1, 0xEC,    0x80, 0xBF,  0x0F, 2 }, /* U+1000..U+CFFF */
    { 0xED, 0xED,    0x80, 0x9F,  0x0F, 2 }, /* U+D000..U+D7FF */
    { 0xEE, 0xEF,    0x80, 0xBF,  0x0F, 2 }, /* U+E000..U+FFFF */
    { 0xF0, 0xF0,    0x90, 0xBF,  0x07, 3 }, /* U+10000..U+3FFFF */
    { 0xF1, 0xF3,    0x80, 0xBF,  0x07, 3 }, /* U+40000..U+FFFFF */
    { 0xF4, 0xF4,    0x80, 0x8F,  0x07, 3 }, /* U+100000..U+10FFFF */
    { 0x00, 0x00,    0x00, 0x00,  0x00, 0 }  /* terminator */
};
WCHAR *Utf8ToUtf16(char *string){
if(!string)return NULL;
DWORD bytes=(strlen(string)*2+1)*sizeof(WCHAR);
WCHAR *result=malloc(bytes);
WCHAR *presult=result;
while(1){
BYTE b=*string++;
if(b==0){//null terminator
*(presult++)=0;
break;
} else if(b<0x80){//ASCII character
*(presult++)=b;
} else {
DWORD ret=0;
UTF8TBLROW *pRow=&utf8tbl[0];
BOOL bad=FALSE;
//b is the byte retrieved at beginning of loop
while(pRow-mym>valueMask&&((b<pRow-mym>byte1f)||(b>pRow->byte1t))){
pRow++;
}
ret=(DWORD)(b&pRow->valueMask);
if(pRow->trail==0){
//check for 0 trailing bytes already made above
bad=TRUE;
} else if(pRow->trail==1){
b=*string++; // second byte
if(b<pRow-mym>byte2f||b>pRow->byte2t)bad=TRUE;
ret=(ret<<6)|(b&0x3F);
} else if(pRow->trail==2){
b=*string++; // second byte
if(b<pRow-mym>byte2f||b>pRow->byte2t)bad=TRUE;
ret=(ret<<6)|(b&0x3F);
b=*string++; // third byte
if(b<0x80||bmym>0xBF)bad=TRUE;
ret=(ret<<6)|(b&0x3F);
} else if(pRow->trail==3){
b=*string++; // second byte
if(b<pRow-mym>byte2f||b>pRow->byte2t)bad=TRUE;
ret=(ret<<6)|(b&0x3F);
b=*string++; // third byte
if(b<0x80||bmym>0xBF)bad=TRUE;
ret=(ret<<6)|(b&0x3F);
b=*string++; // fourth byte
if(b<0x80||bmym>0xBF)bad=TRUE;
ret=(ret<<6)|(b&0x3F);
}
if(bad){
free(result);
return NULL;
} else if((ret&0xFFFFF800)==0xD800){
free(result);
return NULL;
} else if((ret&0xFFFF0000)==0){ // BMP character
*(presult++)=ret;
} else { // SMP character
ret-=0x10000;
*(presult++)=((ret>>10)&0x03FF)|0xD800;
*(presult++)=((ret )&0x03FF)|0xDC00;
}
}
}
return result;
}