aboutsummaryrefslogtreecommitdiff
path: root/src/utf8.c
blob: 99c0cbc1a8d5fc14edbfd02824c00240bb5da6f7 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#include "utf8.h"

// Bit masks that keep only the payload bits of a sequence's lead byte.
// The table is indexed by (sequence length in octets - 1).
static const uint8_t masks[] = {
	[0] = 0x7F, // 1 octet:  0xxxxxxx
	[1] = 0x1F, // 2 octets: 110xxxxx
	[2] = 0x0F, // 3 octets: 1110xxxx
	[3] = 0x07, // 4 octets: 11110xxx
	[4] = 0x03, // 5 octets: 111110xx
	[5] = 0x01, // 6 octets: 1111110x
};

// Lead-byte classification table. A byte b matches a row when
// (b & mask) == result; octets is the sequence length that row announces.
// Rows are meant to be tried in order: the final row matches any remaining
// byte with the high bit set and marks it as not a valid lead byte (-1).
static const struct {
	uint8_t mask;
	uint8_t result;
	int octets;
} sizes[] = {
	{ .mask = 0x80, .result = 0x00, .octets = 1 },
	{ .mask = 0xE0, .result = 0xC0, .octets = 2 },
	{ .mask = 0xF0, .result = 0xE0, .octets = 3 },
	{ .mask = 0xF8, .result = 0xF0, .octets = 4 },
	{ .mask = 0xFC, .result = 0xF8, .octets = 5 },
	{ .mask = 0xFE, .result = 0xFC, .octets = 6 },
	{ .mask = 0x80, .result = 0x80, .octets = -1 },
};

// Returns the sequence length in octets announced by lead byte c, taken from
// the first matching row of the sizes table, or -1 when c cannot start a
// sequence (or no row matches).
static int
utf8_size(uint8_t c)
{
	const size_t count = sizeof(sizes) / sizeof(sizes[0]);
	size_t row = 0;

	while (row < count) {
		// First match wins; the table order matters.
		if ((c & sizes[row].mask) == sizes[row].result) {
			return sizes[row].octets;
		}
		++row;
	}
	return -1;
}

uint32_t
utf8_decode(const char **s)
{
	const uint8_t **bytes = (const uint8_t **)s;

	uint32_t cp = 0;
	if (**s < 0x80) {
		// Shortcut
		cp = **bytes;
		++*bytes;
		return cp;
	}
	int size = utf8_size(**bytes);
	if (size == -1) {
		++*bytes;
		return UTF8_INVALID;
	}
	uint8_t mask = masks[size - 1];
	cp = **bytes & mask;
	++*bytes;
	while (--size) {
		uint8_t c = **bytes;

		++*bytes;

		if ((c >> 6) != 0x02)
			return UTF8_INVALID;

		cp <<= 6;
		cp |= c & 0x3f;
	}
	return cp;
}

// Writes the UTF-8 encoding of code point c into s and returns the number of
// bytes written (1 to 4). No terminator is appended, so s needs room for the
// bytes written. Every value of 0x10000 and above takes the 4-byte form; the
// value is not range-checked.
size_t
utf8_encode(char *s, uint32_t c)
{
	uint8_t lead;
	size_t octets;

	// Pick the sequence length and the marker bits of the lead byte.
	if (c >= 0x10000) {
		lead = 0xf0;
		octets = 4;
	} else if (c >= 0x800) {
		lead = 0xe0;
		octets = 3;
	} else if (c >= 0x80) {
		lead = 0xc0;
		octets = 2;
	} else {
		lead = 0;
		octets = 1;
	}

	// Fill the continuation bytes from the last one backwards, six payload
	// bits at a time.
	for (size_t pos = octets; --pos > 0; ) {
		s[pos] = (c & 0x3f) | 0x80;
		c >>= 6;
	}

	// Whatever is left of c belongs in the lead byte.
	s[0] = c | lead;
	return octets;
}

// Reads one UTF-8 sequence from f and decodes it.
//
// Returns UTF8_INVALID when no byte can be read (end of file or read error),
// when the lead byte announces more than UTF8_MAX_SIZE octets (the announced
// trailing octets are then discarded), or when the stream ends before the
// sequence is complete. Otherwise returns whatever utf8_decode yields for the
// bytes that were read.
uint32_t
utf8_get(FILE *f)
{
	char buffer[UTF8_MAX_SIZE];
	int c = fgetc(f);
	if (c == EOF) {
		return UTF8_INVALID;
	}
	buffer[0] = (char)c;
	int size = utf8_size((uint8_t)c);

	if (size > UTF8_MAX_SIZE) {
		// Skip the rest of the oversized sequence by reading it. Unlike
		// fseek this also works on pipes and terminals, and does not
		// rely on relative seeks in text streams.
		for (int skipped = 1; skipped < size; ++skipped) {
			if (fgetc(f) == EOF) {
				break;
			}
		}
		return UTF8_INVALID;
	}

	if (size > 1) {
		// fread reports a size_t count; compare in that type.
		size_t want = (size_t)size - 1;
		if (fread(&buffer[1], 1, want, f) != want) {
			return UTF8_INVALID;
		}
	}

	// size is -1 for a byte that cannot start a sequence; utf8_decode is
	// left to classify that single byte.
	const char *ptr = buffer;
	return utf8_decode(&ptr);
}