1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
#include "utf8.h"
// Bit masks that isolate the payload bits of a sequence's leading octet.
// The table is indexed by (sequence length in octets - 1).
static const uint8_t masks[] = {
	[0] = 0x7F, // 1 octet:  low 7 bits
	[1] = 0x1F, // 2 octets: low 5 bits
	[2] = 0x0F, // 3 octets: low 4 bits
	[3] = 0x07, // 4 octets: low 3 bits
	[4] = 0x03, // 5 octets: low 2 bits
	[5] = 0x01, // 6 octets: low 1 bit
};
// Classification table for the leading octet of a sequence. An octet c
// matches an entry when (c & mask) == result; entries are tried in order and
// the first match supplies the sequence length in octets.
static const struct {
	uint8_t mask;   // bits of the octet that are compared
	uint8_t result; // value those bits must have for a match
	int octets;     // sequence length for a match, or -1 for "not a lead"
} sizes[] = {
	{ .mask = 0x80, .result = 0x00, .octets = 1 },  // 0xxxxxxx
	{ .mask = 0xE0, .result = 0xC0, .octets = 2 },  // 110xxxxx
	{ .mask = 0xF0, .result = 0xE0, .octets = 3 },  // 1110xxxx
	{ .mask = 0xF8, .result = 0xF0, .octets = 4 },  // 11110xxx
	{ .mask = 0xFC, .result = 0xF8, .octets = 5 },  // 111110xx
	{ .mask = 0xFE, .result = 0xFC, .octets = 6 },  // 1111110x
	// Catch-all: any remaining octet with the high bit set (10xxxxxx
	// continuation octets, 0xFE and 0xFF) cannot start a sequence.
	{ .mask = 0x80, .result = 0x80, .octets = -1 },
};
// Returns the length in octets of the sequence introduced by the leading
// octet c, or -1 when c cannot start a sequence. The answer comes from the
// first entry of the sizes table whose bit pattern matches c.
static int
utf8_size(uint8_t c)
{
	const size_t count = sizeof(sizes) / sizeof(sizes[0]);
	size_t idx = 0;
	while (idx < count) {
		uint8_t selected = c & sizes[idx].mask;
		if (selected == sizes[idx].result) {
			return sizes[idx].octets;
		}
		++idx;
	}
	// No table entry matched.
	return -1;
}
uint32_t
utf8_decode(const char **s)
{
const uint8_t **bytes = (const uint8_t **)s;
uint32_t cp = 0;
if (**s < 0x80) {
// Shortcut
cp = **bytes;
++*bytes;
return cp;
}
int size = utf8_size(**bytes);
if (size == -1) {
++*bytes;
return UTF8_INVALID;
}
uint8_t mask = masks[size - 1];
cp = **bytes & mask;
++*bytes;
while (--size) {
uint8_t c = **bytes;
++*bytes;
if ((c >> 6) != 0x02)
return UTF8_INVALID;
cp <<= 6;
cp |= c & 0x3f;
}
return cp;
}
/*
 * Writes the UTF-8 encoding of code point c to s and returns the number of
 * octets written (1 to 4). No terminator is appended; s must have room for
 * the returned number of octets. The value of c is not validated.
 */
size_t
utf8_encode(char *s, uint32_t c)
{
	size_t len;
	uint8_t lead;

	// Choose the sequence length and the marker bits of the leading octet.
	if (c >= 0x10000) {
		lead = 0xf0;
		len = 4;
	} else if (c >= 0x800) {
		lead = 0xe0;
		len = 3;
	} else if (c >= 0x80) {
		lead = 0xc0;
		len = 2;
	} else {
		lead = 0;
		len = 1;
	}

	// Fill the continuation octets from the last one backwards, taking
	// 6 bits of the code point for each.
	for (char *out = s + (len - 1); out > s; --out) {
		*out = (char)((c & 0x3f) | 0x80);
		c >>= 6;
	}

	// Whatever bits remain go into the leading octet.
	*s = (char)(c | lead);
	return len;
}
/*
 * Reads one UTF-8 encoded code point from the stream f.
 *
 * Returns the decoded code point, or UTF8_INVALID when:
 *  - the stream is at end-of-file or the first read fails;
 *  - the leading octet announces a sequence longer than UTF8_MAX_SIZE (the
 *    remaining octets of that sequence are read and discarded);
 *  - fewer continuation octets are available than the leading octet
 *    announces;
 *  - utf8_decode rejects the octets that were read.
 */
uint32_t
utf8_get(FILE *f)
{
	char buffer[UTF8_MAX_SIZE];
	int c = fgetc(f);
	if (c == EOF) {
		return UTF8_INVALID;
	}
	buffer[0] = (char)c;

	int size = utf8_size((uint8_t)c);
	if (size > UTF8_MAX_SIZE) {
		// Skip the rest of the over-long sequence by reading it rather
		// than seeking: fseek fails on non-seekable streams such as
		// pipes and terminals, which would leave the continuation
		// octets to be misread as the start of new characters.
		for (int skipped = 1; skipped < size; ++skipped) {
			if (fgetc(f) == EOF) {
				break;
			}
		}
		return UTF8_INVALID;
	}

	if (size > 1) {
		size_t want = (size_t)size - 1;
		if (fread(&buffer[1], 1, want, f) != want) {
			return UTF8_INVALID;
		}
	}

	// A leading octet that utf8_size rejected (size == -1) falls through
	// to here; utf8_decode classifies it again and reports the error.
	const char *ptr = buffer;
	return utf8_decode(&ptr);
}
|