aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHimbeer <himbeer@disroot.org>2024-09-11 22:51:44 +0200
committerHimbeer <himbeer@disroot.org>2024-09-11 22:51:44 +0200
commit86d91d38ba462ef35a7960c4b91e2b0c123c9158 (patch)
treecfbccf96de45686cdc4d6a8585f1fc4b0ee51e6f
parent7091ec2009ae8446802ed3c55b174208a53e2999 (diff)
Add lexer for names and identifiers
-rw-r--r--Makefile24
-rw-r--r--README.md2
-rw-r--r--include/lex.h114
-rw-r--r--include/utf8.h15
-rw-r--r--include/util.h9
-rw-r--r--src/lex.c190
-rw-r--r--src/utf8.c124
7 files changed, 467 insertions, 11 deletions
diff --git a/Makefile b/Makefile
index 5a7bd56..afa9407 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,23 @@
.POSIX:
BINOUT = .bin
-HDR = include/lex.h
+HDR = \
+ include/lex.h \
+ include/utf8.h \
+ include/util.h
OBJ = \
src/lex.o \
- src/main.o
+ src/main.o \
+ src/utf8.o
-CFLAGS = -std=c99
+CFLAGS = -Iinclude
+
+# Listed ahead of the other rules in this Makefile so that it is the target
+# built by a plain `make`.
+all: $(BINOUT)/cerc
+
+# Link all object files into the cerc binary, creating the output directory
+# first. The printf only prints a short "CCLD <target>" progress line.
+$(BINOUT)/cerc: $(OBJ)
+ @mkdir -p -- $(BINOUT)
+ @printf 'CCLD\t%s\n' '$@'
+ @$(CC) $(LDFLAGS) -o $@ $(OBJ)
.SUFFIXES:
.SUFFIXES: .c .o
@@ -17,14 +28,7 @@ src/main.o: $(HDR)
@printf 'CC\t%s\n' '$@'
@$(CC) -c $(CFLAGS) -o $@ $<
-$(BINOUT)/cerc: $(OBJ)
- @mkdir -p -- $(BINOUT)
- @printf 'CCLD\t%s\n' '$@'
- @$(CC) $(LDFLAGS) -o $@ $(OBJ)
-
clean:
@rm -rf -- $(BINOUT) $(OBJ)
-all: $(BINOUT)/cerc
-
.PHONY: clean
diff --git a/README.md b/README.md
index 33f53d6..e41943d 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# cerc
-This is a [Cer](https://himbeerserver.de/md/cer.md) compiler written in C99 for
+This is a [Cer](https://himbeerserver.de/md/cer.md) compiler for
POSIX-compatible systems.
## Building
diff --git a/include/lex.h b/include/lex.h
new file mode 100644
index 0000000..23619d5
--- /dev/null
+++ b/include/lex.h
@@ -0,0 +1,114 @@
+#ifndef CERC_LEX_H
+#define CERC_LEX_H
+#include <stdio.h>
+#include "utf8.h"
+
+#define C_EOF UTF8_INVALID
+
+// Every kind of token known to the lexer. The values double as indices into
+// the tokens[] spelling table, so the order here must stay in sync with it.
+enum lexical_token {
+ // Keywords
+ T_BREAK,
+ T_CONST,
+ T_CONTINUE,
+ T_DEFER,
+ T_ELSE,
+ T_ENUM,
+ T_EXPORT,
+ T_EXTERN,
+ T_FOR,
+ T_FUNC,
+ T_IF,
+ T_IMPORT,
+ T_LET,
+ T_MUT,
+ T_PUB,
+ T_RETURN,
+ T_STRUCT,
+ T_UNION,
+ // One past the last keyword: used as the exclusive upper bound when a word
+ // is looked up in the keyword table.
+ T_LAST_KEYWORD,
+
+ // Builtin types
+ T_F32,
+ T_F64,
+ T_I8,
+ T_I16,
+ T_I32,
+ T_I64,
+ T_ISIZE,
+ T_U8,
+ T_U16,
+ T_U32,
+ T_U64,
+ T_USIZE,
+ // NOTE(review): presumably the end marker of the builtin-type group, like
+ // T_LAST_KEYWORD; not referenced by the lexer code yet - confirm.
+ T_LAST_BUILTIN_TYPE,
+
+ // Operators
+ T_ADD,
+ T_AND,
+ T_BAND,
+ T_BANG,
+ T_BNOT,
+ T_BOR,
+ T_BSHL,
+ T_BSHR,
+ T_BXOR,
+ T_COLON,
+ T_COMMA,
+ T_DIV,
+ T_DOT,
+ T_EQ,
+ T_GE,
+ T_GT,
+ T_LBRACE,
+ T_LBRACKET,
+ T_LE,
+ T_LPAREN,
+ T_LT,
+ T_MINUS,
+ T_MODULO,
+ T_NEQ,
+ T_OR,
+ T_QUESTION,
+ T_RBRACE,
+ T_RBRACKET,
+ T_RPAREN,
+ T_SEMICOLON,
+ T_STAR,
+ T_UNDERSCORE,
+ // NOTE(review): presumably the end marker of the operator group; not
+ // referenced by the lexer code yet - confirm.
+ T_LAST_OPERATOR,
+
+ // Tokens with additional information
+ // T_IDENT: a non-keyword word that contains at least one '/'.
+ T_IDENT,
+ // T_NAME: a non-keyword word without any '/'.
+ T_NAME,
+ T_NUMBER,
+ T_STRING,
+
+ // Magic values
+ // T_EOF: produced by lex() when the input is exhausted.
+ T_EOF,
+ // T_NONE: marks the lexer's unlex slot as empty.
+ T_NONE,
+};
+
+// Spelling of tokens, indexed by enum lexical_token. Defined in src/lex.c,
+// which currently provides entries for the keywords only.
+extern const char *tokens[];
+
+// One lexed token: its kind plus the payload that kind carries, if any.
+struct token {
+ enum lexical_token token;
+ union {
+ // Text of a T_IDENT or T_NAME token. The lexer stores a strdup() copy
+ // here, i.e. heap memory; it is left untouched for keywords and T_EOF.
+ const char *str;
+ } info;
+};
+
+// State of one lexer instance; initialise it with lex_init().
+struct lexer {
+ // Stream the code points are read from.
+ FILE *in;
+ // One-character pushback: c[1] remembers the last code point read, and a
+ // non-zero c[0] means that code point must be handed out again.
+ uint32_t c[2];
+ // Scratch buffer in which the current word is accumulated as UTF-8.
+ char *buf;
+ // Allocated size of buf, and number of bytes in use (terminator excluded).
+ size_t bufsz, buflen;
+ // Token stored by unlex(); its kind is T_NONE while the slot is empty.
+ struct token un;
+};
+
+// Prepares `lexer` for reading tokens from `f` and allocates its scratch
+// buffer. Terminates the process with EXIT_ABNORMAL if that allocation fails.
+void lex_init(struct lexer *lexer, FILE *f);
+// Closes the lexer's input stream.
+void lex_finish(struct lexer *lexer);
+
+// Produces the next token in `out` and returns its kind. A token handed back
+// via unlex() is returned first; otherwise whitespace is skipped, T_EOF is
+// produced at end of input, and a word starting with an ASCII letter becomes
+// a keyword, T_NAME or T_IDENT. Other input is not lexed by this version.
+enum lexical_token lex(struct lexer *lexer, struct token *out);
+// Stores a copy of `in` so that the next lex() call returns it again. Only one
+// token can be pending at a time (checked with assert).
+void unlex(struct lexer *lexer, const struct token *in);
+
+#endif
diff --git a/include/utf8.h b/include/utf8.h
new file mode 100644
index 0000000..f29dcd3
--- /dev/null
+++ b/include/utf8.h
@@ -0,0 +1,15 @@
+#ifndef CERC_UTF8_H
+#define CERC_UTF8_H
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+// Largest number of bytes utf8_encode() writes for a single code point.
+#define UTF8_MAX_SIZE 4
+// Value returned instead of a code point when decoding fails; utf8_get() also
+// returns it at end of file.
+#define UTF8_INVALID UINT32_MAX
+
+// Decodes one code point starting at *s and advances *s past the bytes that
+// were consumed. Returns UTF8_INVALID for a malformed sequence.
+uint32_t utf8_decode(const char **s);
+// Encodes `c` as UTF-8 into `s`, which must have room for UTF8_MAX_SIZE bytes.
+// Returns the number of bytes written; no terminator is appended.
+size_t utf8_encode(char *s, uint32_t c);
+// Reads and decodes one code point from `f`. Returns UTF8_INVALID at end of
+// file and for malformed or truncated sequences.
+uint32_t utf8_get(FILE *f);
+
+#endif
diff --git a/include/util.h b/include/util.h
new file mode 100644
index 0000000..2a1e72a
--- /dev/null
+++ b/include/util.h
@@ -0,0 +1,9 @@
+#ifndef CERC_UTIL_H
+#define CERC_UTIL_H
+/* Process exit codes used by cerc in addition to EXIT_SUCCESS. */
+enum exit_status {
+ /* EXIT_SUCCESS = 0 (defined in stdlib.h) */
+ /* NOTE(review): presumably for errors in the user's input; no user is
+  * visible in this change - confirm. */
+ EXIT_USER = 1,
+ /* Used by the lexer when a memory allocation fails. */
+ EXIT_ABNORMAL = 255,
+};
+
+#endif
diff --git a/src/lex.c b/src/lex.c
new file mode 100644
index 0000000..f67a213
--- /dev/null
+++ b/src/lex.c
@@ -0,0 +1,190 @@
+#define _POSIX_C_SOURCE 200809L
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "lex.h"
+#include "util.h"
+
+// Keyword spellings, indexed by token kind through designated initialisers.
+// Only keywords are listed, so the array has exactly T_LAST_KEYWORD elements;
+// search() must never be given a larger bound.
+const char *tokens[] = {
+ // Must match enum lexical_token (lex.h)
+ [T_BREAK] = "break",
+ [T_CONST] = "const",
+ [T_CONTINUE] = "continue",
+ [T_DEFER] = "defer",
+ [T_ELSE] = "else",
+ [T_ENUM] = "enum",
+ [T_EXPORT] = "export",
+ [T_EXTERN] = "extern",
+ [T_FOR] = "for",
+ [T_FUNC] = "func",
+ [T_IF] = "if",
+ [T_IMPORT] = "import",
+ [T_LET] = "let",
+ [T_MUT] = "mut",
+ [T_PUB] = "pub",
+ [T_RETURN] = "return",
+ [T_STRUCT] = "struct",
+ [T_UNION] = "union",
+};
+
+// Sets up `lexer` to tokenise the stream `f`: no pushed-back character, no
+// pushed-back token, and an empty two-byte scratch buffer. Running out of
+// memory is fatal (message on stderr, exit status EXIT_ABNORMAL).
+void
+lex_init(struct lexer *lexer, FILE *f)
+{
+	char *scratch = malloc(2);
+	if (scratch == NULL) {
+		fprintf(stderr, "Out of memory\n");
+		exit(EXIT_ABNORMAL);
+	}
+
+	lexer->in = f;
+	lexer->c[0] = 0;
+	lexer->c[1] = 0;
+
+	lexer->buf = scratch;
+	lexer->bufsz = 2;
+	lexer->buflen = 0;
+
+	lexer->un.token = T_NONE;
+}
+
+// Releases everything lex_init() acquired: closes the input stream and frees
+// the scratch buffer. The buffer fields are reset so that a stray second
+// release of the buffer is harmless (free(NULL) is a no-op).
+void
+lex_finish(struct lexer *lexer)
+{
+	fclose(lexer->in);
+
+	free(lexer->buf);
+	lexer->buf = NULL;
+	lexer->bufsz = 0;
+	lexer->buflen = 0;
+}
+
+// Returns the next code point of the input. A character pushed back with
+// unget() is delivered first; otherwise one code point is read from the
+// stream and remembered in c[1] so that it can be pushed back later. C_EOF is
+// passed through without being remembered.
+static uint32_t
+next(struct lexer *lexer)
+{
+	if (lexer->c[0] != 0) {
+		// Consume the pending pushback.
+		uint32_t pending = lexer->c[1];
+		lexer->c[0] = 0;
+		return pending;
+	}
+
+	uint32_t fresh = utf8_get(lexer->in);
+	if (fresh != C_EOF) {
+		lexer->c[1] = fresh;
+	}
+	return fresh;
+}
+
+// Tells whether `c` is one of the characters the lexer skips between tokens:
+// horizontal tab, line feed or space.
+static bool
+iscerspace(uint32_t c)
+{
+	switch (c) {
+	case '\t':
+	case '\n':
+	case ' ':
+		return true;
+	default:
+		return false;
+	}
+}
+
+// Skips whitespace and returns the first code point that is not whitespace,
+// or C_EOF if the input ends first.
+static uint32_t
+wgetc(struct lexer *lexer)
+{
+	uint32_t c;
+
+	do {
+		c = next(lexer);
+	} while (c != C_EOF && iscerspace(c));
+
+	return c;
+}
+
+// Pushes the most recently read code point back so that the following next()
+// call returns it again. Only a single level of pushback exists, which the
+// assertion enforces.
+static void
+unget(struct lexer *lexer)
+{
+	const uint32_t last = lexer->c[1];
+
+	assert(lexer->c[0] == 0);
+	lexer->c[0] = last;
+}
+
+// Appends `sz` bytes from `buf` to the lexer's scratch buffer and keeps the
+// result NUL-terminated. The buffer is doubled as often as necessary, so an
+// append larger than the current capacity cannot overflow it. Running out of
+// memory is fatal (message on stderr, exit status EXIT_ABNORMAL).
+static void
+extend(struct lexer *lexer, const char *buf, size_t sz)
+{
+	// Existing text + new bytes + terminating NUL.
+	size_t needed = lexer->buflen + sz + 1;
+
+	if (needed > lexer->bufsz) {
+		size_t newsz = lexer->bufsz ? lexer->bufsz : 2;
+		while (newsz < needed) {
+			newsz *= 2;
+		}
+
+		// Keep the old pointer until realloc is known to have succeeded.
+		char *grown = realloc(lexer->buf, newsz);
+		if (!grown) {
+			fprintf(stderr, "Out of memory\n");
+			exit(EXIT_ABNORMAL);
+		}
+		lexer->buf = grown;
+		lexer->bufsz = newsz;
+	}
+
+	memcpy(&lexer->buf[lexer->buflen], buf, sz);
+	lexer->buflen += sz;
+	lexer->buf[lexer->buflen] = '\0';
+}
+
+// Appends the UTF-8 encoding of code point `c` to the lexer's scratch buffer.
+static void
+push(struct lexer *lexer, uint32_t c)
+{
+	char encoded[UTF8_MAX_SIZE];
+	const size_t nbytes = utf8_encode(encoded, c);
+
+	extend(lexer, encoded, nbytes);
+}
+
+// Empties the scratch buffer without releasing its memory. The first byte is
+// reset as well so that buf stays a valid (empty) C string, matching the
+// "buf[buflen] is NUL" invariant that extend() maintains.
+static void
+clear(struct lexer *lexer)
+{
+	lexer->buflen = 0;
+	lexer->buf[0] = '\0';
+}
+
+// Looks `buf` up among the first `bound` entries of tokens[]. Returns a
+// pointer to the matching table slot (so that the caller can recover the
+// index by pointer subtraction) or NULL if no entry matches.
+static void *
+search(const char *buf, int bound)
+{
+	int idx = 0;
+
+	while (idx < bound) {
+		const char **slot = &tokens[idx];
+		if (strcmp(buf, *slot) == 0) {
+			return slot;
+		}
+		++idx;
+	}
+
+	return NULL;
+}
+
+// Lexes the remainder of a word whose first character the caller has already
+// pushed into the scratch buffer. A word consists of ASCII letters, digits
+// and '/'. The result is the matching keyword if there is one; otherwise
+// T_IDENT when the word contains a '/', else T_NAME. For T_IDENT and T_NAME
+// out->info.str receives a heap-allocated copy of the word. The scratch
+// buffer is emptied before returning.
+static enum lexical_token
+lex_ident(struct lexer *lexer, struct token *out)
+{
+	// Must start out false: it is only ever set when a '/' is seen, and
+	// reading it uninitialised would be undefined behaviour.
+	bool isident = false;
+
+	uint32_t c;
+	while ((c = next(lexer)) != C_EOF) {
+		if (c >= 0x7F || (!isalnum(c) && c != '/')) {
+			// Not part of the word: leave it for the next token.
+			unget(lexer);
+			break;
+		}
+		if (c == '/') {
+			isident = true;
+		}
+		push(lexer, c);
+	}
+
+	void *token = search(lexer->buf, T_LAST_KEYWORD);
+	if (token) {
+		// The slot's position in tokens[] is the keyword's enum value.
+		out->token = (enum lexical_token)((const char **)token - tokens);
+	} else {
+		char *copy = strdup(lexer->buf);
+		if (!copy) {
+			fprintf(stderr, "Out of memory\n");
+			exit(EXIT_ABNORMAL);
+		}
+		out->token = isident ? T_IDENT : T_NAME;
+		out->info.str = copy;
+	}
+	clear(lexer);
+	return out->token;
+}
+
+// Produces the next token in `out` and returns its kind.
+//
+// A token handed back through unlex() is returned first. Otherwise leading
+// whitespace is skipped; end of input yields T_EOF and a word starting with
+// an ASCII letter is lexed by lex_ident(). Any other character is consumed
+// and reported as T_NONE, because no further token classes are lexed yet;
+// this gives the function a defined result on every path instead of falling
+// off its end.
+enum lexical_token
+lex(struct lexer *lexer, struct token *out)
+{
+	if (lexer->un.token != T_NONE) {
+		*out = lexer->un;
+		lexer->un.token = T_NONE;
+		return out->token;
+	}
+
+	uint32_t c = wgetc(lexer);
+	if (c == C_EOF) {
+		out->token = T_EOF;
+		return out->token;
+	}
+
+	if (c <= 0x7F && isalpha(c)) {
+		push(lexer, c);
+		return lex_ident(lexer, out);
+	}
+
+	out->token = T_NONE;
+	return out->token;
+}
+
+// Hands token `in` back to the lexer; the next lex() call returns a copy of
+// it. The single pushback slot must be empty, which the assertion checks.
+void
+unlex(struct lexer *lexer, const struct token *in)
+{
+	struct token *slot = &lexer->un;
+
+	assert(slot->token == T_NONE);
+	*slot = *in;
+}
diff --git a/src/utf8.c b/src/utf8.c
new file mode 100644
index 0000000..99c0cbc
--- /dev/null
+++ b/src/utf8.c
@@ -0,0 +1,124 @@
+#include "utf8.h"
+
+// Payload bits of the leading byte, indexed by (sequence length - 1).
+static const uint8_t masks[] = {
+	[0] = 0x7F,
+	[1] = 0x1F,
+	[2] = 0x0F,
+	[3] = 0x07,
+	[4] = 0x03,
+	[5] = 0x01,
+};
+
+// Classification of a leading byte: a byte b belongs to the first row for
+// which (b & mask) == result, and `octets` is the length of the sequence it
+// starts. The last row catches continuation bytes, which cannot start a
+// sequence and are reported as -1.
+static const struct {
+	uint8_t mask;
+	uint8_t result;
+	int octets;
+} sizes[] = {
+	{ .mask = 0x80, .result = 0x00, .octets = 1 },
+	{ .mask = 0xE0, .result = 0xC0, .octets = 2 },
+	{ .mask = 0xF0, .result = 0xE0, .octets = 3 },
+	{ .mask = 0xF8, .result = 0xF0, .octets = 4 },
+	{ .mask = 0xFC, .result = 0xF8, .octets = 5 },
+	{ .mask = 0xFE, .result = 0xFC, .octets = 6 },
+	{ .mask = 0x80, .result = 0x80, .octets = -1 },
+};
+
+// Returns the length in bytes of the sequence introduced by leading byte `c`
+// according to the sizes[] table, or -1 if `c` cannot start a sequence.
+static int
+utf8_size(uint8_t c)
+{
+	const size_t count = sizeof(sizes) / sizeof(sizes[0]);
+	size_t row = 0;
+
+	while (row < count) {
+		if ((c & sizes[row].mask) == sizes[row].result) {
+			return sizes[row].octets;
+		}
+		++row;
+	}
+
+	return -1;
+}
+
+// Decodes one code point starting at *s and advances *s past every byte that
+// was examined.
+//
+// Returns the code point, or UTF8_INVALID if the first byte cannot start a
+// sequence or a continuation byte is malformed (the offending byte is
+// consumed too, as before).
+//
+// The bytes are read through an unsigned pointer: comparing a plain `char`
+// against 0x80 is always true where `char` is signed, which made every
+// non-ASCII byte take the single-byte shortcut.
+uint32_t
+utf8_decode(const char **s)
+{
+	const uint8_t *bytes = (const uint8_t *)*s;
+	const uint8_t lead = *bytes++;
+
+	if (lead < 0x80) {
+		// Shortcut
+		*s = (const char *)bytes;
+		return lead;
+	}
+
+	int size = utf8_size(lead);
+	if (size == -1) {
+		*s = (const char *)bytes;
+		return UTF8_INVALID;
+	}
+
+	uint32_t cp = lead & masks[size - 1];
+	while (--size) {
+		const uint8_t c = *bytes++;
+
+		// Continuation bytes have the form 10xxxxxx.
+		if ((c >> 6) != 0x02) {
+			*s = (const char *)bytes;
+			return UTF8_INVALID;
+		}
+
+		cp <<= 6;
+		cp |= c & 0x3f;
+	}
+
+	*s = (const char *)bytes;
+	return cp;
+}
+
+// Writes the UTF-8 encoding of `c` to `s` and returns the number of bytes
+// written (1 to 4). No terminator is appended. Values of 0x10000 and above
+// are always written as four bytes.
+size_t
+utf8_encode(char *s, uint32_t c)
+{
+	size_t len;
+	uint8_t lead;
+
+	// Pick the sequence length and the marker bits of the leading byte.
+	if (c >= 0x10000) {
+		lead = 0xf0;
+		len = 4;
+	} else if (c >= 0x800) {
+		lead = 0xe0;
+		len = 3;
+	} else if (c >= 0x80) {
+		lead = 0xc0;
+		len = 2;
+	} else {
+		lead = 0;
+		len = 1;
+	}
+
+	// Fill the continuation bytes back to front, six payload bits each.
+	size_t pos = len - 1;
+	while (pos > 0) {
+		s[pos] = (c & 0x3f) | 0x80;
+		c >>= 6;
+		--pos;
+	}
+
+	// Whatever is left of `c` goes into the leading byte.
+	s[0] = c | lead;
+	return len;
+}
+
+// Reads one UTF-8 encoded code point from `f`.
+//
+// Returns the code point, or UTF8_INVALID at end of file, for a byte that
+// cannot start a sequence, for a sequence longer than UTF8_MAX_SIZE (its
+// remaining bytes are skipped) and for a truncated or malformed sequence.
+// Note that end of file and invalid input are indistinguishable to callers.
+uint32_t
+utf8_get(FILE *f)
+{
+	char buffer[UTF8_MAX_SIZE];
+	int c = fgetc(f);
+	if (c == EOF) {
+		return UTF8_INVALID;
+	}
+	buffer[0] = (char)c;
+
+	int size = utf8_size((uint8_t)c);
+	if (size < 1) {
+		// Stray continuation byte: reject it here rather than relying
+		// on the decoder.
+		return UTF8_INVALID;
+	}
+
+	if (size > UTF8_MAX_SIZE) {
+		// Skip the rest of the over-long sequence by reading it, which
+		// also works on streams that cannot seek (pipes, terminals).
+		for (int skipped = 1; skipped < size; ++skipped) {
+			if (fgetc(f) == EOF) {
+				break;
+			}
+		}
+		return UTF8_INVALID;
+	}
+
+	if (size > 1) {
+		size_t want = (size_t)size - 1;
+		if (fread(&buffer[1], 1, want, f) != want) {
+			return UTF8_INVALID;
+		}
+	}
+
+	const char *ptr = buffer;
+	return utf8_decode(&ptr);
+}