#define _POSIX_C_SOURCE 200809L #include #include #include #include #include #include "lex.h" #include "util.h" const char *tokens[] = { // Must match enum lexical_token (lex.h) [T_ALIGN] = "align", [T_AS] = "as", [T_BREAK] = "break", [T_CONST] = "const", [T_CONTINUE] = "continue", [T_DEFER] = "defer", [T_ELSE] = "else", [T_ENUM] = "enum", [T_EXTERN] = "extern", [T_FALSE] = "false", [T_FOR] = "for", [T_FUNC] = "func", [T_IF] = "if", [T_INCLUDE] = "include", [T_LET] = "let", [T_PUB] = "pub", [T_RETURN] = "return", [T_STRUCT] = "struct", [T_TAGOF] = "tagof", [T_TRUE] = "true", [T_UNION] = "union", [T_VAR] = "var", [T_BOOL] = "bool", [T_FLOAT32] = "float32", [T_FLOAT64] = "float64", [T_INT8] = "int8", [T_INT16] = "int16", [T_INT32] = "int32", [T_INT64] = "int64", [T_INT] = "int", [T_UINT8] = "uint8", [T_UINT16] = "uint16", [T_UINT32] = "uint32", [T_UINT64] = "uint64", [T_UINT] = "uint", }; void lex_init(struct lexer *lexer, FILE *f, const char *filename) { lexer->in = f; lexer->c[0] = 0; lexer->c[1] = 0; lexer->buf = must_malloc(2); lexer->bufsz = 2; lexer->buflen = 0; lexer->un.token = T_NONE; lexer->loc.file = filename; lexer->loc.line = 1; lexer->loc.column = 0; } void lex_finish(struct lexer *lexer) { fclose(lexer->in); } struct location lex_loc(struct lexer *lexer) { if (lexer->un.token != T_NONE) { return lexer->un.loc; } return lexer->loc; } static uint32_t next(struct lexer *lexer) { if (lexer->c[0]) { lexer->c[0] = 0; return lexer->c[1]; } uint32_t c = utf8_get(lexer->in); if (c == C_EOF) { return c; } if (c == '\n') { ++lexer->loc.line; lexer->loc.column = 0; } else { ++lexer->loc.column; } lexer->c[1] = c; return c; } static bool iscerspace(uint32_t c) { return c == '\t' || c == '\n' || c == ' '; } static uint32_t wgetc(struct lexer *lexer) { uint32_t c; while ((c = next(lexer)) != C_EOF && iscerspace(c)) ; return c; } static void unget(struct lexer *lexer) { assert(!lexer->c[0]); lexer->c[0] = lexer->c[1]; } static void extend(struct lexer *lexer, const char *buf, size_t sz) { if (lexer->buflen + sz >= lexer->bufsz) { lexer->bufsz *= 2; lexer->buf = realloc(lexer->buf, lexer->bufsz); if (!lexer->buf) { fprintf(stderr, "Out of memory\n"); exit(EXIT_ABNORMAL); } } memcpy(&lexer->buf[lexer->buflen], buf, sz); lexer->buflen += sz; lexer->buf[lexer->buflen] = '\0'; } static void push(struct lexer *lexer, uint32_t c) { char buf[UTF8_MAX_SIZE]; size_t sz = utf8_encode(buf, c); extend(lexer, buf, sz); } static void clear(struct lexer *lexer) { lexer->buflen = 0; } static void * search(const char *buf, int bound) { for (int i = 0; i < bound; ++i) { if (tokens[i] && (strcmp(buf, tokens[i]) == 0)) { return &tokens[i]; } } return NULL; } static enum lexical_token lex_name(struct lexer *lexer, struct token *out) { uint32_t c; while ((c = next(lexer)) != C_EOF) { if (c >= 0x7F || !isalnum(c)) { unget(lexer); break; } push(lexer, c); } void *token = search(lexer->buf, T_LAST_BUILTIN_TYPE); if (!token) { out->token = T_NAME; out->info.str = strdup(lexer->buf); } else { out->token = (const char **)token - tokens; } clear(lexer); return out->token; } static enum lexical_token lex_number(struct lexer *lexer, struct token *out) { bool isfloat = false; uint32_t c; while ((c = next(lexer)) != C_EOF) { if (c >= 0x7F || (!isdigit(c) && c != '_' && c != '.' && c != 'x')) { unget(lexer); break; } if (c == '.') { isfloat = true; } if (c != '_') { push(lexer, c); } } out->info.num.isfloat = isfloat; if (isfloat) { out->info.num.value.floatingpt = atof(lexer->buf); } else { out->info.num.value.integer = strtol(lexer->buf, NULL, 0); } clear(lexer); out->token = T_NUMBER; return out->token; } static void lex_char(struct lexer *lexer, struct token *out) { uint32_t c = next(lexer); if (next(lexer) != '\'') { error(lex_loc(lexer), "unclosed or multi-character char literal"); } out->info.chr = c; out->token = T_CHAR; } static enum lexical_token lex_string(struct lexer *lexer, struct token *out, uint32_t delim) { uint32_t c; while ((c = next(lexer)) != C_EOF && c != delim) { push(lexer, c); } out->info.str = strdup(lexer->buf); clear(lexer); out->token = T_STRING; return out->token; } static enum lexical_token lex_comment(struct lexer *lexer, struct token *out) { uint32_t c; if ((c = next(lexer)) != C_EOF && c != ' ') { unget(lexer); } while ((c = next(lexer)) != C_EOF && c != '\n') { push(lexer, c); } out->info.str = strdup(lexer->buf); clear(lexer); out->token = T_COMMENT; return out->token; } enum lexical_token lex(struct lexer *lexer, struct token *out) { while (lex_any(lexer, out) == T_COMMENT); return out->token; } enum lexical_token lex_any(struct lexer *lexer, struct token *out) { if (lexer->un.token != T_NONE) { *out = lexer->un; lexer->un.token = T_NONE; return out->token; } uint32_t c = wgetc(lexer); out->loc = lexer->loc; if (c == C_EOF) { out->token = T_EOF; return out->token; } if (c <= 0x7F && isalpha(c)) { push(lexer, c); return lex_name(lexer, out); } if (c <= 0x7F && isdigit(c)) { push(lexer, c); return lex_number(lexer, out); } uint32_t ch; switch (c) { case '+': if ((ch = wgetc(lexer)) == '+') { out->token = T_INCR; } else if (ch == '=') { out->token = T_ADDASSIGN; } else { unget(lexer); out->token = T_ADD; } break; case '&': if ((ch = wgetc(lexer)) == '&') { out->token = T_AND; } else if (ch == '=') { out->token = T_BANDASSIGN; } else { unget(lexer); out->token = T_BAND; } break; case '!': if (wgetc(lexer) == '=') { out->token = T_NEQ; } else { out->token = T_BANG; } break; case '~': out->token = T_BNOT; break; case '|': if ((ch = wgetc(lexer)) == '|') { out->token = T_OR; } else if (ch == '=') { out->token = T_BORASSIGN; } else { unget(lexer); out->token = T_BOR; } break; case '<': if ((ch = wgetc(lexer)) == '<') { out->token = T_BSHL; } else if (ch == '=') { out->token = T_LE; } else { unget(lexer); out->token = T_LT; } break; case '>': if ((ch = wgetc(lexer)) == '>') { out->token = T_BSHL; } else if (ch == '=') { out->token = T_GE; } else { unget(lexer); out->token = T_GT; } break; case '^': if (wgetc(lexer) == '=') { out->token = T_BXORASSIGN; } else { unget(lexer); out->token = T_BXOR; } break; case ':': if (wgetc(lexer) == ':') { out->token = T_MODDELIM; } else { unget(lexer); out->token = T_COLON; } break; case ',': out->token = T_COMMA; break; case '/': if ((ch = wgetc(lexer)) == '/') { return lex_comment(lexer, out); } else if (ch == '=') { out->token = T_DIVASSIGN; } else { unget(lexer); out->token = T_DIV; } break; case '.': out->token = T_DOT; break; case '=': if (wgetc(lexer) == '=') { out->token = T_EQ; } else { unget(lexer); out->token = T_ASSIGN; } break; case '{': out->token = T_LBRACE; break; case '[': out->token = T_LBRACKET; break; case '(': out->token = T_LPAREN; break; case '-': if ((ch = wgetc(lexer)) == '-') { out->token = T_DECR; } else if (ch == '=') { out->token = T_SUBASSIGN; } else { unget(lexer); out->token = T_MINUS; } break; case '%': if (wgetc(lexer) == '=') { out->token = T_MODASSIGN; } else { unget(lexer); out->token = T_MODULO; } break; case '?': out->token = T_QUESTION; break; case '}': out->token = T_RBRACE; break; case ']': out->token = T_RBRACKET; break; case ')': out->token = T_RPAREN; break; case ';': out->token = T_SEMICOLON; break; case '*': if (wgetc(lexer) == '=') { out->token = T_MULASSIGN; } else { unget(lexer); out->token = T_STAR; } break; case '_': out->token = T_UNDERSCORE; break; case '\'': lex_char(lexer, out); break; case '"': return lex_string(lexer, out, c); default: return T_NONE; } return out->token; } void unlex(struct lexer *lexer, const struct token *in) { assert(lexer->un.token == T_NONE); lexer->un = *in; } bool match(struct lexer *lexer, enum lexical_token token) { struct token tmp; enum lexical_token kind = lex(lexer, &tmp); if (kind != token) { unlex(lexer, &tmp); } return kind == token; }