UTF-8 to UTF-16

Sun, 20 Jan 2008 07:58:42 +0000 - Author: Peter O.

The C function below properly converts UTF-8 text to UTF-16, for use with Windows functions and similar APIs. The example also shows how to decode individual characters from a UTF-8 string and encode them as UTF-16. If you don't want to include windows.h, use this header instead:



/* Stand-ins for the windows.h names used by the conversion code.
   Actual widths follow the platform's short/long/int. */

/* Unsigned aliases. */
typedef unsigned char  BYTE;   /* one raw byte of UTF-8 input */
typedef unsigned short WORD;
typedef unsigned short WCHAR;  /* one UTF-16 code unit */
typedef unsigned long  DWORD;

/* Signed aliases. */
typedef signed long    LONG;
typedef int            BOOL;   /* holds TRUE or FALSE */

/* Null pointer constant, only if no header has provided one yet. */
#if !defined(NULL)
#define NULL ((void*)0)
#endif

/* Boolean constants, only if not already provided. */
#if !defined(TRUE)
#define TRUE 1
#endif
#if !defined(FALSE)
#define FALSE 0
#endif


The code follows. I release it to the public domain.



// One row of the UTF-8 decoding table: a range of lead bytes together with
// the range allowed for the byte that follows, the mask that extracts the
// lead byte's payload bits, and the number of trailing bytes.
typedef struct {
  BYTE byte1f;     // lowest lead byte covered by this row
  BYTE byte1t;     // highest lead byte covered by this row
  BYTE byte2f;     // lowest value accepted for the second byte
  BYTE byte2t;     // highest value accepted for the second byte
  BYTE valueMask;  // ANDed with the lead byte to get its payload bits
  BYTE trail;      // number of bytes that follow the lead byte
} UTF8TBLROW;

// Lead-byte lookup table for UTF-8 decoding. Rows are searched in order by
// lead byte; the all-zero row (valueMask 0) terminates the search.
// Columns: byte1f, byte1t, byte2f, byte2t, valueMask, trail.
static const UTF8TBLROW utf8tbl[] = {
  {0x00, 0x7F, 0,    0,    0x7F, 0},  // U+0000..U+007F (single byte)
  {0xC2, 0xDF, 0x80, 0xBF, 0x1F, 1},  // U+0080..U+07FF
  {0xE0, 0xE0, 0xA0, 0xBF, 0x0F, 2},  // U+0800..U+0FFF
  {0xE1, 0xEC, 0x80, 0xBF, 0x0F, 2},  // U+1000..U+CFFF
  {0xED, 0xED, 0x80, 0x9F, 0x0F, 2},  // U+D000..U+D7FF
  {0xEE, 0xEF, 0x80, 0xBF, 0x0F, 2},  // U+E000..U+FFFF
  {0xF0, 0xF0, 0x90, 0xBF, 0x07, 3},  // U+10000..U+3FFFF
  {0xF1, 0xF3, 0x80, 0xBF, 0x07, 3},  // U+40000..U+FFFFF
  {0xF4, 0xF4, 0x80, 0x8F, 0x07, 3},  // U+100000..U+10FFFF
  {0,    0,    0,    0,    0,    0}   // end-of-table marker
};

WCHAR *Utf8ToUtf16(char *string){ if(!string)return NULL; DWORD bytes=(strlen(string)*2+1)*sizeof(WCHAR); WCHAR *result=malloc(bytes); WCHAR *presult=result; while(1){ BYTE b=*string++; if(b==0){//null terminator *(presult++)=0; break; } else if(b<0x80){//ASCII character *(presult++)=b; } else { DWORD ret=0; UTF8TBLROW *pRow=&utf8tbl[0]; BOOL bad=FALSE; //b is the byte retrieved at beginning of loop while(pRow-mym>valueMask&&((b<pRow-mym>byte1f)||(b>pRow->byte1t))){ pRow++; } ret=(DWORD)(b&pRow->valueMask); if(pRow->trail==0){ //check for 0 trailing bytes already made above bad=TRUE; } else if(pRow->trail==1){ b=*string++; // second byte if(b<pRow-mym>byte2f||b>pRow->byte2t)bad=TRUE; ret=(ret<<6)|(b&0x3F); } else if(pRow->trail==2){ b=*string++; // second byte if(b<pRow-mym>byte2f||b>pRow->byte2t)bad=TRUE; ret=(ret<<6)|(b&0x3F); b=*string++; // third byte if(b<0x80||bmym>0xBF)bad=TRUE; ret=(ret<<6)|(b&0x3F); } else if(pRow->trail==3){ b=*string++; // second byte if(b<pRow-mym>byte2f||b>pRow->byte2t)bad=TRUE; ret=(ret<<6)|(b&0x3F); b=*string++; // third byte if(b<0x80||bmym>0xBF)bad=TRUE; ret=(ret<<6)|(b&0x3F); b=*string++; // fourth byte if(b<0x80||bmym>0xBF)bad=TRUE; ret=(ret<<6)|(b&0x3F); } if(bad){ free(result); return NULL; } else if((ret&0xFFFFF800)==0xD800){ free(result); return NULL; } else if((ret&0xFFFF0000)==0){ // BMP character *(presult++)=ret; } else { // SMP character ret-=0x10000; *(presult++)=((ret>>10)&0x03FF)|0xD800; *(presult++)=((ret )&0x03FF)|0xDC00; } } } return result; }


Discussion

Other Formats