summaryrefslogtreecommitdiffstats
path: root/squirrel_3_0_1_stable/sqplus/SqPlusUtf8.cpp
blob: 7505e9928e1a5b64a6045490ea47225454989ac8 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133

///////////////////////////////////////////////////////////////////////
// Simple conversion routines, to, from UTF8 and full Unicode character
// (using int). 
// 

// Only when needed 
#if !defined(SQPLUS_USE_LATIN1) || SQPLUS_USE_LATIN1==0

static char g_utf8_length[256];
static int g_did_init_length; 

void sqplus_init_utf8_lengths() {
    // Fill in lengths in array above
    for( int lb=0; lb<256; lb++ ){
        int l = -1;
        if( !(lb&0x80) ) l=1;
        else if( (lb&0xE0)==0xC0 ) l=2;
        else if( (lb&0xF0)==0xE0 ) l=3;
        else if( (lb&0xF8)==0xF0 ) l=4;
        else if( (lb&0xFC)==0xF8 ) l=5;
        else if( (lb&0xFE)==0xFC ) l=6;
        g_utf8_length[lb] = l;
    }
    g_did_init_length = 1;
}

// Length of a UTF8 encoded Unicode character
int sqplus_utf8_len(int lead_byte){
    if( !(lead_byte&0x80) ) return 1;  // Special case, make faster
    if( !g_did_init_length )
        sqplus_init_utf8_lengths();
        
    return g_utf8_length[(unsigned char)lead_byte];
}

int sqplus_utf8_len_first(const char* pc){
    int lb = *(unsigned char*)pc;
    if( !(lb&0x80) ) return 1;  // Special case, make faster
    if( !g_did_init_length )
        sqplus_init_utf8_lengths();

    int l = g_utf8_length[(unsigned char)lb];
    if( l>0 ) return l;
    
    // Invalid UTF8 lead byte. Look for next valid character. 
    const char *pc1 = pc+1;
    while( ((*pc1)&0xC0)==0x80 ) 
        pc1++;
    return int(pc1 - pc);
}


// Length of a UTF8 encoded Unicode string (number of Unicode characters)
int sqplus_utf8_strlen(const char *str) {
    if( !str ) return 0;
    int l, tl=0;
    while( *str ){
        l = sqplus_utf8_len_first(str);
        str += l;
        tl++;
    }
    return tl;
}

// Convert one UTF8 encoded character to Unicode point
int sqplus_utf8_to_wchar(int *result, const char *string){
    int res=-1;
    
    // Assume argument pointers to be OK
    unsigned char ch = *string;
    int l = sqplus_utf8_len(ch);
    
    if( l<1  ) return -1;
    int wc = l>1 ? (ch&(0x7F>>l)) : ch;
    while( l>1 ){
        wc = (wc<<6) + (*++string & 0x3F);
        l--;
    }
    *result = wc;
    
    return 0;
}

// Convert one Unicode point to UTF8 encoded version.
// Checks if output fits in 1/4/6 bytes buffer.
int sqplus_wchar_to_utf8(char *s, int wc, int size){
    if( size<1 ) return -1;
    if( (unsigned int)wc>=0x80000000 ) return -2;
    
    // Single byte case
    if( wc<0x80 ){
        *s = (char)wc;
        //*s = (char)wc&0x7F;
        return 1;
    }
    if( size<4 ) return -3;

    // Two or more UTF8 bytes
    int p = 1;  // Index of last UTF8 byte
    if( wc>0x7FF ){ // 11 bits
        // Three or more UTF8 bytes
        p++; // p>=2
        if( wc>0xFFFF ){    // 16 bits
            // Four or more UTF8 bytes
            p++; // p>=3
            if( wc>0x1FFFFF ){  // 21 bits
                // Five or more UTF8 bytes
                if( size<6 ) return -3;
                p++; // p>=4 UTF8 bytes
                if( wc>0x3FFFFFF ){ // 26 bits
                    // Six UTF8 bytes
                    p++; // p>=5 
                    if( (unsigned int)wc>(unsigned int)0x7FFFFFF ){  // 31 bits
						// Would need 7 UTF8 bytes. Not supported.
						return -10;
                    }
                    s[p-4] = 0x80 | ((wc>>24)&0x3F);    // Bit 24..29
                }
                s[p-3] = 0x80 | ((wc>>18)&0x3F);    // Bit 18..23
            }
            s[p-2] = 0x80 | ((wc>>12)&0x3F);    // Bit 12..17
        }
        s[p-1] = 0x80 | ((wc>>6)&0x3F);    // Bit 6..11
    }
    s[p] = 0x80 | (wc&0x3F);    // Bit 0..5
    s[0] = (0xFC << (5-p)) | (wc>>(p*6));
    
    return p+1;
}

#endif // #if !defined(SQPLUS_USE_LATIN1) || SQPLUS_USE_LATIN1==0