diff options
author | Himbeer <himbeer@disroot.org> | 2024-09-11 22:51:44 +0200 |
---|---|---|
committer | Himbeer <himbeer@disroot.org> | 2024-09-11 22:51:44 +0200 |
commit | 86d91d38ba462ef35a7960c4b91e2b0c123c9158 (patch) | |
tree | cfbccf96de45686cdc4d6a8585f1fc4b0ee51e6f | |
parent | 7091ec2009ae8446802ed3c55b174208a53e2999 (diff) |
Add lexer for names and identifiers
-rw-r--r-- | Makefile | 24 | ||||
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | include/lex.h | 114 | ||||
-rw-r--r-- | include/utf8.h | 15 | ||||
-rw-r--r-- | include/util.h | 9 | ||||
-rw-r--r-- | src/lex.c | 252 | ||||
-rw-r--r-- | src/utf8.c | 150 |
7 files changed, 555 insertions, 11 deletions
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,23 @@
 .POSIX:
 
 BINOUT = .bin
-HDR = include/lex.h
+HDR = \
+	include/lex.h \
+	include/utf8.h \
+	include/util.h
 OBJ = \
 	src/lex.o \
-	src/main.o
+	src/main.o \
+	src/utf8.o
 
-CFLAGS = -std=c99
+CFLAGS = -Iinclude
+
+all: $(BINOUT)/cerc
+
+$(BINOUT)/cerc: $(OBJ)
+	@mkdir -p -- $(BINOUT)
+	@printf 'CCLD\t%s\n' '$@'
+	@$(CC) $(LDFLAGS) -o $@ $(OBJ)
 
 .SUFFIXES:
 .SUFFIXES: .c .o
@@ -17,14 +28,7 @@ src/main.o: $(HDR)
 	@printf 'CC\t%s\n' '$@'
 	@$(CC) -c $(CFLAGS) -o $@ $<
 
-$(BINOUT)/cerc: $(OBJ)
-	@mkdir -p -- $(BINOUT)
-	@printf 'CCLD\t%s\n' '$@'
-	@$(CC) $(LDFLAGS) -o $@ $(OBJ)
-
 clean:
 	@rm -rf -- $(BINOUT) $(OBJ)
 
-all: $(BINOUT)/cerc
-
 .PHONY: clean
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # cerc
 
-This is a [Cer](https://himbeerserver.de/md/cer.md) compiler written in C99 for
+This is a [Cer](https://himbeerserver.de/md/cer.md) compiler for
 POSIX-compatible systems.
 
 ## Building
diff --git a/include/lex.h b/include/lex.h
new file mode 100644
index 0000000..23619d5
--- /dev/null
+++ b/include/lex.h
@@ -0,0 +1,114 @@
+#ifndef CERC_LEX_H
+#define CERC_LEX_H
+#include <stdio.h>
+#include "utf8.h"
+
+#define C_EOF UTF8_INVALID
+
+enum lexical_token {
+	// Keywords
+	T_BREAK,
+	T_CONST,
+	T_CONTINUE,
+	T_DEFER,
+	T_ELSE,
+	T_ENUM,
+	T_EXPORT,
+	T_EXTERN,
+	T_FOR,
+	T_FUNC,
+	T_IF,
+	T_IMPORT,
+	T_LET,
+	T_MUT,
+	T_PUB,
+	T_RETURN,
+	T_STRUCT,
+	T_UNION,
+	T_LAST_KEYWORD,
+
+	// Builtin types
+	T_F32,
+	T_F64,
+	T_I8,
+	T_I16,
+	T_I32,
+	T_I64,
+	T_ISIZE,
+	T_U8,
+	T_U16,
+	T_U32,
+	T_U64,
+	T_USIZE,
+	T_LAST_BUILTIN_TYPE,
+
+	// Operators
+	T_ADD,
+	T_AND,
+	T_BAND,
+	T_BANG,
+	T_BNOT,
+	T_BOR,
+	T_BSHL,
+	T_BSHR,
+	T_BXOR,
+	T_COLON,
+	T_COMMA,
+	T_DIV,
+	T_DOT,
+	T_EQ,
+	T_GE,
+	T_GT,
+	T_LBRACE,
+	T_LBRACKET,
+	T_LE,
+	T_LPAREN,
+	T_LT,
+	T_MINUS,
+	T_MODULO,
+	T_NEQ,
+	T_OR,
+	T_QUESTION,
+	T_RBRACE,
+	T_RBRACKET,
+	T_RPAREN,
+	T_SEMICOLON,
+	T_STAR,
+	T_UNDERSCORE,
+	T_LAST_OPERATOR,
+
+	// Tokens with additional information
+	T_IDENT,
+	T_NAME,
+	T_NUMBER,
+	T_STRING,
+
+	// Magic values
+	T_EOF,
+	T_NONE,
+};
+
+extern const char *tokens[];
+
+struct token {
+	enum lexical_token token;
+	union {
+		const char *str;
+	} info;
+};
+
+struct lexer {
+	FILE *in;
+	uint32_t c[2];
+	char *buf;
+	size_t bufsz, buflen;
+	struct token un;
+};
+
+void lex_init(struct lexer *lexer, FILE *f);
+void lex_finish(struct lexer *lexer);
+
+enum lexical_token lex(struct lexer *lexer, struct token *out);
+void unlex(struct lexer *lexer, const struct token *in);
+
+#endif
diff --git a/include/utf8.h b/include/utf8.h
new file mode 100644
index 0000000..f29dcd3
--- /dev/null
+++ b/include/utf8.h
@@ -0,0 +1,15 @@
+#ifndef CERC_UTF8_H
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#define CERC_UTF8_H
+#define UTF8_MAX_SIZE 4
+#define UTF8_INVALID UINT32_MAX
+
+uint32_t utf8_decode(const char **s);
+size_t utf8_encode(char *s, uint32_t c);
+uint32_t utf8_get(FILE *f);
+
+#endif
diff --git a/include/util.h b/include/util.h
new file mode 100644
index 0000000..2a1e72a
--- /dev/null
+++ b/include/util.h
@@ -0,0 +1,9 @@
+#ifndef CERC_UTIL_H
+#define CERC_UTIL_H
+enum exit_status {
+	/* EXIT_SUCCESS = 0 (defined in stdlib.h) */
+	EXIT_USER = 1,
+	EXIT_ABNORMAL = 255,
+};
+
+#endif
diff --git a/src/lex.c b/src/lex.c
new file mode 100644
--- /dev/null
+++ b/src/lex.c
@@ -0,0 +1,252 @@
+#define _POSIX_C_SOURCE 200809L
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "lex.h"
+#include "util.h"
+
+const char *tokens[] = {
+	// Must match enum lexical_token (lex.h)
+	[T_BREAK] = "break",
+	[T_CONST] = "const",
+	[T_CONTINUE] = "continue",
+	[T_DEFER] = "defer",
+	[T_ELSE] = "else",
+	[T_ENUM] = "enum",
+	[T_EXPORT] = "export",
+	[T_EXTERN] = "extern",
+	[T_FOR] = "for",
+	[T_FUNC] = "func",
+	[T_IF] = "if",
+	[T_IMPORT] = "import",
+	[T_LET] = "let",
+	[T_MUT] = "mut",
+	[T_PUB] = "pub",
+	[T_RETURN] = "return",
+	[T_STRUCT] = "struct",
+	[T_UNION] = "union",
+};
+
+// Prepares lexer to read tokens from f. Exits with EXIT_ABNORMAL if the
+// token buffer cannot be allocated.
+void
+lex_init(struct lexer *lexer, FILE *f)
+{
+	lexer->in = f;
+	lexer->c[0] = 0;
+	lexer->c[1] = 0;
+	lexer->buf = malloc(2);
+	if (!lexer->buf) {
+		fprintf(stderr, "Out of memory\n");
+		exit(EXIT_ABNORMAL);
+	}
+	// Keep buf a valid C string even before the first push().
+	lexer->buf[0] = '\0';
+	lexer->bufsz = 2;
+	lexer->buflen = 0;
+	lexer->un.token = T_NONE;
+}
+
+// Closes the input stream and releases the token buffer.
+void
+lex_finish(struct lexer *lexer)
+{
+	fclose(lexer->in);
+	free(lexer->buf);
+	lexer->buf = NULL;
+}
+
+// Returns the next code point: the one handed back by unget() if there is
+// one, otherwise a freshly read one. Returns C_EOF once utf8_get() reports
+// end of input or an undecodable sequence.
+static uint32_t
+next(struct lexer *lexer)
+{
+	if (lexer->c[0]) {
+		lexer->c[0] = 0;
+		return lexer->c[1];
+	}
+
+	uint32_t c = utf8_get(lexer->in);
+	if (c == C_EOF) {
+		return c;
+	}
+
+	// Remember the code point so that unget() can replay it.
+	lexer->c[1] = c;
+	return c;
+}
+
+// Reports whether c is whitespace: tab, newline or space.
+static bool
+iscerspace(uint32_t c)
+{
+	return c == '\t' || c == '\n' || c == ' ';
+}
+
+// Skips whitespace and returns the first code point that is not whitespace,
+// or C_EOF.
+static uint32_t
+wgetc(struct lexer *lexer)
+{
+	uint32_t c;
+	while ((c = next(lexer)) != C_EOF && iscerspace(c)) ;
+	return c;
+}
+
+// Hands the most recently read code point back so that the following next()
+// returns it again. Only one code point can be pending at a time.
+static void
+unget(struct lexer *lexer)
+{
+	assert(!lexer->c[0]);
+	lexer->c[0] = lexer->c[1];
+}
+
+// Appends sz bytes from buf to the token buffer and keeps it NUL-terminated.
+// Exits with EXIT_ABNORMAL if the buffer cannot be grown.
+static void
+extend(struct lexer *lexer, const char *buf, size_t sz)
+{
+	// Room is needed for the new bytes plus the terminating NUL. Doubling
+	// once is not always enough (e.g. 4 bytes into the initial 2-byte
+	// buffer), so grow until the data fits.
+	size_t need = lexer->buflen + sz + 1;
+	if (need > lexer->bufsz) {
+		size_t newsz = lexer->bufsz;
+		while (newsz < need) {
+			newsz *= 2;
+		}
+		// Do not overwrite lexer->buf until realloc has succeeded.
+		char *newbuf = realloc(lexer->buf, newsz);
+		if (!newbuf) {
+			fprintf(stderr, "Out of memory\n");
+			exit(EXIT_ABNORMAL);
+		}
+		lexer->buf = newbuf;
+		lexer->bufsz = newsz;
+	}
+	memcpy(&lexer->buf[lexer->buflen], buf, sz);
+	lexer->buflen += sz;
+	lexer->buf[lexer->buflen] = '\0';
+}
+
+// Appends the UTF-8 encoding of c to the token buffer.
+static void
+push(struct lexer *lexer, uint32_t c)
+{
+	char buf[UTF8_MAX_SIZE];
+	size_t sz = utf8_encode(buf, c);
+	extend(lexer, buf, sz);
+}
+
+// Empties the token buffer; its storage is kept for reuse.
+static void
+clear(struct lexer *lexer)
+{
+	lexer->buflen = 0;
+	lexer->buf[0] = '\0';
+}
+
+// Looks buf up among the first bound entries of tokens. Returns a pointer to
+// the matching entry, or NULL if there is none.
+static void *
+search(const char *buf, int bound)
+{
+	for (int i = 0; i < bound; ++i) {
+		if (strcmp(buf, tokens[i]) == 0) {
+			return &tokens[i];
+		}
+	}
+
+	return NULL;
+}
+
+// Duplicates s, exiting with EXIT_ABNORMAL instead of returning NULL.
+static char *
+xstrdup(const char *s)
+{
+	char *dup = strdup(s);
+	if (!dup) {
+		fprintf(stderr, "Out of memory\n");
+		exit(EXIT_ABNORMAL);
+	}
+	return dup;
+}
+
+// Lexes the remainder of a keyword, name or identifier whose first character
+// is already in the token buffer. Text containing a '/' becomes T_IDENT,
+// other non-keyword text becomes T_NAME; for both, out->info.str is a
+// heap-allocated copy of the text.
+static enum lexical_token
+lex_ident(struct lexer *lexer, struct token *out)
+{
+	// Must start out false: it is only ever set when a '/' is seen.
+	bool isident = false;
+
+	uint32_t c;
+	while ((c = next(lexer)) != C_EOF) {
+		if (c >= 0x7F || (!isalnum(c) && c != '/')) {
+			unget(lexer);
+			break;
+		}
+		if (c == '/') {
+			isident = true;
+		}
+		push(lexer, c);
+	}
+
+	void *token = search(lexer->buf, T_LAST_KEYWORD);
+	if (!token && isident) {
+		out->token = T_IDENT;
+		out->info.str = xstrdup(lexer->buf);
+	} else if (!token) {
+		out->token = T_NAME;
+		out->info.str = xstrdup(lexer->buf);
+	} else {
+		out->token = (const char **)token - tokens;
+	}
+	clear(lexer);
+	return out->token;
+}
+
+// Reads the next token into out and returns its type. A token handed back
+// with unlex() is returned first. T_EOF is returned at end of input.
+enum lexical_token
+lex(struct lexer *lexer, struct token *out)
+{
+	if (lexer->un.token != T_NONE) {
+		*out = lexer->un;
+		lexer->un.token = T_NONE;
+		return out->token;
+	}
+
+	uint32_t c = wgetc(lexer);
+	if (c == C_EOF) {
+		out->token = T_EOF;
+		return out->token;
+	}
+
+	if (c <= 0x7F && isalpha(c)) {
+		push(lexer, c);
+		return lex_ident(lexer, out);
+	}
+
+	// Nothing else is lexed yet. Return a defined value rather than
+	// falling off the end of a non-void function.
+	out->token = T_NONE;
+	return out->token;
+}
+
+// Hands a token back to the lexer; the next lex() call returns it. Only one
+// token can be pending at a time.
+void
+unlex(struct lexer *lexer, const struct token *in)
+{
+	assert(lexer->un.token == T_NONE);
+	lexer->un = *in;
+}
diff --git a/src/utf8.c b/src/utf8.c
new file mode 100644
--- /dev/null
+++ b/src/utf8.c
@@ -0,0 +1,150 @@
+#include "utf8.h"
+
+// Payload bits of the first byte, indexed by sequence length - 1.
+static const uint8_t masks[] = {
+	0x7F,
+	0x1F,
+	0x0F,
+	0x07,
+	0x03,
+	0x01
+};
+
+// First-byte patterns: (byte & mask) == result identifies a sequence of
+// `octets` bytes. The last entry catches bytes that cannot start a sequence.
+static const struct {
+	uint8_t mask;
+	uint8_t result;
+	int octets;
+} sizes[] = {
+	{ 0x80, 0x00, 1 },
+	{ 0xE0, 0xC0, 2 },
+	{ 0xF0, 0xE0, 3 },
+	{ 0xF8, 0xF0, 4 },
+	{ 0xFC, 0xF8, 5 },
+	{ 0xFE, 0xFC, 6 },
+	{ 0x80, 0x80, -1 },
+};
+
+// Returns the length in bytes of the sequence introduced by first byte c, or
+// -1 if c cannot start a sequence.
+static int
+utf8_size(uint8_t c)
+{
+	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); ++i) {
+		if ((c & sizes[i].mask) == sizes[i].result) {
+			return sizes[i].octets;
+		}
+	}
+	return -1;
+}
+
+// Decodes one code point from the string at *s and advances *s past the
+// bytes consumed. Returns UTF8_INVALID for a malformed sequence.
+uint32_t
+utf8_decode(const char **s)
+{
+	// Read the bytes as unsigned. Where plain char is signed, bytes >= 0x80
+	// are negative, so testing **s < 0x80 would treat every byte as ASCII.
+	const uint8_t *p = (const uint8_t *)*s;
+	uint32_t cp;
+
+	if (*p < 0x80) {
+		// Shortcut
+		cp = *p++;
+		*s = (const char *)p;
+		return cp;
+	}
+	int size = utf8_size(*p);
+	if (size == -1) {
+		*s = (const char *)(p + 1);
+		return UTF8_INVALID;
+	}
+	cp = *p++ & masks[size - 1];
+	while (--size) {
+		uint8_t c = *p++;
+
+		if ((c >> 6) != 0x02) {
+			// Not a continuation byte; it has still been consumed.
+			cp = UTF8_INVALID;
+			break;
+		}
+
+		cp <<= 6;
+		cp |= c & 0x3f;
+	}
+	*s = (const char *)p;
+	return cp;
+}
+
+// Writes the UTF-8 encoding of c to s, which must have room for
+// UTF8_MAX_SIZE bytes, and returns the number of bytes written. No NUL
+// terminator is added.
+size_t
+utf8_encode(char *s, uint32_t c)
+{
+	size_t len = 0;
+	uint8_t first;
+
+	if (c < 0x80) {
+		first = 0;
+		len = 1;
+	} else if (c < 0x800) {
+		first = 0xc0;
+		len = 2;
+	} else if (c < 0x10000) {
+		first = 0xe0;
+		len = 3;
+	} else {
+		first = 0xf0;
+		len = 4;
+	}
+
+	for (size_t i = len - 1; i > 0; --i) {
+		s[i] = (c & 0x3f) | 0x80;
+		c >>= 6;
+	}
+
+	s[0] = c | first;
+	return len;
+}
+
+// Reads one UTF-8 encoded code point from f. Returns UTF8_INVALID at end of
+// input, on a short read, and for malformed sequences or sequences longer
+// than UTF8_MAX_SIZE bytes.
+uint32_t
+utf8_get(FILE *f)
+{
+	char buffer[UTF8_MAX_SIZE];
+	int c = fgetc(f);
+	if (c == EOF) {
+		return UTF8_INVALID;
+	}
+	buffer[0] = (char)c;
+	int size = utf8_size((uint8_t)c);
+
+	if (size < 0) {
+		// This byte cannot start a sequence.
+		return UTF8_INVALID;
+	}
+
+	if (size > UTF8_MAX_SIZE) {
+		// Skip the rest of the sequence by reading it: unlike fseek,
+		// this also works on pipes and terminals.
+		for (int i = 1; i < size; ++i) {
+			if (fgetc(f) == EOF) {
+				break;
+			}
+		}
+		return UTF8_INVALID;
+	}
+
+	if (size > 1) {
+		size_t want = (size_t)size - 1;
+		if (fread(&buffer[1], 1, want, f) != want) {
+			return UTF8_INVALID;
+		}
+	}
+	const char *ptr = buffer;
+	return utf8_decode(&ptr);
+}