gccrs: Add Lexer for Rust front-end

The lexer is referred to as a ManagedTokenSource within the parser. This lexer does not currently support Unicode, but serves as a starting point to do so. gcc/rust/ * lex/rust-codepoint.h: New. * lex/rust-lex.cc: New. * lex/rust-lex.h: New. * lex/rust-token.cc: New. * lex/rust-token.h: New. * rust-buffered-queue.h: New. Co-authored-by: Philip Herron <philip.herron@embecosm.com> Co-authored-by: Arthur Cohen <arthur.cohen@embecosm.com> Co-authored-by: Mark Wielaard <mark@klomp.org> Signed-off-by: Joel Phillips <simplytheother@gmail.com>
2026-02-22 20:01:22 -05:00 · 2022-08-23 16:11:00 +01:00
parent 5b981e9c74
commit 18f6990f84
6 changed files with 3831 additions and 0 deletions
--- a/gcc/rust/lex/rust-codepoint.h
+++ b/gcc/rust/lex/rust-codepoint.h
@@ -0,0 +1,46 @@
+// Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_CODEPOINT_H
+#define RUST_CODEPOINT_H
+
+#include "rust-system.h"
+
+namespace Rust {
+struct Codepoint
+{
+  /* Raw 32-bit value of the codepoint.  UINT32_MAX is reserved as the
+   * end-of-file marker (see eof () and is_eof ()). */
+  uint32_t value;
+
+  // Default construction yields the zero codepoint.
+  Codepoint () : value (0) {}
+
+  // Wraps a raw 32-bit value (implicit conversion from uint32_t is allowed).
+  Codepoint (uint32_t value) : value (value) {}
+
+  // Comparison looks at the stored value only.
+  bool operator== (Codepoint other) const { return other.value == value; }
+  bool operator!= (Codepoint other) const { return other.value != value; }
+
+  // Sentinel codepoint standing for end of file.
+  static Codepoint eof () { return Codepoint (UINT32_MAX); }
+  // Whether this codepoint is the end-of-file sentinel.
+  bool is_eof () const { return *this == eof (); }
+
+  // Returns the codepoint as a C++ string (declared here, defined elsewhere).
+  std::string as_string ();
+};
+} // namespace Rust
+
+#endif
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -0,0 +1,271 @@
+// Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_LEX_H
+#define RUST_LEX_H
+
+#include "rust-linemap.h"
+#include "rust-buffered-queue.h"
+#include "rust-token.h"
+
+namespace Rust {
+/* Simple move-only wrapper for FILE* that simplifies destruction.  The held
+ * stream is closed with fclose when the wrapper is destroyed or overwritten by
+ * move-assignment, unless it is null or stdin, which are never closed.  The
+ * filename pointer is stored as given; it is neither copied nor freed. */
+struct RAIIFile
+{
+private:
+  FILE *file;
+  const char *filename;
+
+  // Closes the held stream unless there is none or it is stdin.
+  void close ()
+  {
+    if (file != nullptr && file != stdin)
+      fclose (file);
+  }
+
+public:
+  /* Opens FILENAME for reading; the special name "-" selects stdin instead.
+   * If fopen fails the wrapper holds a null stream, which ok () reports. */
+  RAIIFile (const char *filename) : filename (filename)
+  {
+    if (strcmp (filename, "-") == 0)
+      file = stdin;
+    else
+      file = fopen (filename, "r");
+  }
+
+  /**
+   * Create a RAIIFile from an existing instance of FILE*
+   */
+  RAIIFile (FILE *raw, const char *filename = nullptr)
+    : file (raw), filename (filename)
+  {}
+
+  RAIIFile (const RAIIFile &other) = delete;
+  RAIIFile &operator= (const RAIIFile &other) = delete;
+
+  // have to specify setting file to nullptr, otherwise unintended fclose occurs
+  RAIIFile (RAIIFile &&other) : file (other.file), filename (other.filename)
+  {
+    other.file = nullptr;
+  }
+
+  /* Takes over OTHER's stream, closing the one currently held.  OTHER is left
+   * without a stream. */
+  RAIIFile &operator= (RAIIFile &&other)
+  {
+    /* Detach the incoming stream from OTHER before closing our own.  On
+     * self-move-assignment this leaves nothing for close () to release, so the
+     * stream survives instead of being closed and replaced by null. */
+    FILE *incoming = other.file;
+    other.file = nullptr;
+
+    close ();
+    file = incoming;
+    filename = other.filename;
+
+    return *this;
+  }
+
+  // Wrapper holding neither a stream nor a filename; ok () is false for it.
+  static RAIIFile create_error () { return RAIIFile (nullptr, nullptr); }
+
+  ~RAIIFile () { close (); }
+
+  // Borrowed access to the stream; ownership stays with the wrapper.
+  FILE *get_raw () { return file; }
+  // Filename as passed at construction; may be null.
+  const char *get_filename () { return filename; }
+
+  // True when a (non-null) stream is held.
+  bool ok () const { return file; }
+};
+
+/* Lexer for the Rust front-end.  Input chars are read from an InputSource
+ * through input_queue; build_token () builds tokens from them, and those are
+ * handed out through token_queue (see peek_token and skip_token). */
+class Lexer
+{
+private:
+  // Request new Location for current column in line_table
+  Location get_current_location ();
+
+  // Skips the current input char.
+  void skip_input ();
+  // Advances current input char to n + 1 chars ahead of current position.
+  void skip_input (int n);
+
+  // Peeks the current char.
+  int peek_input ();
+  // Returns char n chars ahead of current position.
+  int peek_input (int n);
+
+  // Classifies keyword (i.e. gets id for keyword).
+  TokenId classify_keyword (const std::string &str);
+
+  // Builds a token from the input queue.
+  TokenPtr build_token ();
+
+  std::tuple<std::string, int, bool> parse_in_decimal ();
+  std::pair<std::string, int> parse_in_exponent_part ();
+  std::pair<PrimitiveCoreType, int> parse_in_type_suffix ();
+  std::tuple<char, int, bool> parse_escape (char opening_char);
+  std::tuple<Codepoint, int, bool> parse_utf8_escape (char opening_char);
+  int parse_partial_string_continue ();
+  std::pair<long, int> parse_partial_hex_escape ();
+  std::pair<Codepoint, int> parse_partial_unicode_escape ();
+
+  int get_input_codepoint_length ();
+  int test_get_input_codepoint_n_length (int n_start_offset);
+  Codepoint peek_codepoint_input ();
+  Codepoint test_peek_codepoint_input (int n);
+  void skip_codepoint_input ();
+  void skip_broken_string_input (int current_char);
+
+  TokenPtr parse_byte_char (Location loc);
+  TokenPtr parse_byte_string (Location loc);
+  TokenPtr parse_raw_byte_string (Location loc);
+  TokenPtr parse_raw_identifier (Location loc);
+  TokenPtr parse_string (Location loc);
+  TokenPtr maybe_parse_raw_string (Location loc);
+  TokenPtr parse_raw_string (Location loc, int initial_hash_count);
+  TokenPtr parse_non_decimal_int_literals (Location loc);
+  TokenPtr parse_decimal_int_or_float (Location loc);
+  TokenPtr parse_char_or_lifetime (Location loc);
+  TokenPtr parse_identifier_or_keyword (Location loc);
+
+  template <typename IsDigitFunc>
+  TokenPtr parse_non_decimal_int_literal (Location loc,
+					  IsDigitFunc is_digit_func,
+					  std::string existent_str, int base);
+
+public:
+  // Construct lexer with input file and filename provided
+  Lexer (const char *filename, RAIIFile input, Linemap *linemap);
+
+  // Lex the contents of a string instead of a file
+  Lexer (const std::string &input);
+
+  // dtor
+  ~Lexer ();
+
+  // don't allow copy semantics (for now, at least)
+  Lexer (const Lexer &other) = delete;
+  Lexer &operator= (const Lexer &other) = delete;
+
+  /* enable move semantics
+   * NOTE(review): token_queue's TokenSource stores a raw Lexer pointer, so a
+   * defaulted move looks like it leaves that pointing at the moved-from
+   * lexer - confirm before relying on moving a Lexer. */
+  Lexer (Lexer &&other) = default;
+  Lexer &operator= (Lexer &&other) = default;
+
+  // Returns token n tokens ahead of current position.
+  const_TokenPtr peek_token (int n) { return token_queue.peek (n); }
+  // Peeks the current token.
+  const_TokenPtr peek_token () { return peek_token (0); }
+
+  // Advances current token to n + 1 tokens ahead of current position.
+  void skip_token (int n) { token_queue.skip (n); }
+  // Skips the current token.
+  void skip_token () { skip_token (0); }
+
+  // Replaces the current token with a specified token.
+  void replace_current_token (TokenPtr replacement);
+  // FIXME: don't use anymore
+
+  /* Splits the current token into two. Intended for use with nested generics
+   * closes (i.e. T<U<X>> where >> is wrongly lexed as one token). Note that
+   * this will only work with "simple" tokens like punctuation. */
+  void split_current_token (TokenId new_left, TokenId new_right);
+
+  Linemap *get_line_map () { return line_map; }
+
+  /* Returns the filename of the input as a string, or an empty string when
+   * the input has no filename (RAIIFile allows a null one): constructing a
+   * std::string from a null pointer is undefined behaviour. */
+  std::string get_filename ()
+  {
+    const char *name = input.get_filename ();
+    return name != nullptr ? std::string (name) : std::string ();
+  }
+
+private:
+  void start_line (int current_line, int current_column);
+
+  // File for use as input.
+  RAIIFile input;
+  // TODO is this actually required? could just have file storage in InputSource
+
+  // Current line number.
+  int current_line;
+  // Current column number.
+  int current_column;
+  // Current character.
+  int current_char;
+  // Line map.
+  Linemap *line_map;
+
+  /* Max column number that can be quickly allocated - higher may require
+   * allocating new linemap */
+  static const int max_column_hint = 80;
+
+  // Input source wrapper thing.
+  class InputSource
+  {
+  public:
+    virtual ~InputSource () {}
+
+    // Returns the next char from the input stream, or EOF when exhausted.
+    virtual int next () = 0;
+  };
+
+  // Input source that reads chars from a FILE* using fgetc.
+  class FileInputSource : public InputSource
+  {
+  private:
+    // Input source file.
+    FILE *input;
+
+  public:
+    // Create new input source from file.
+    FileInputSource (FILE *input) : input (input) {}
+
+    int next () override { return fgetc (input); }
+  };
+
+  /* Input source that reads chars from a std::string.  Only a reference to
+   * the string is kept, so it must stay alive while the source is in use. */
+  class BufferInputSource : public InputSource
+  {
+  private:
+    const std::string &buffer;
+    size_t offs;
+
+  public:
+    // Create new input source from a string buffer, starting at offset.
+    BufferInputSource (const std::string &b, size_t offset)
+      : buffer (b), offs (offset)
+    {}
+
+    int next () override
+    {
+      if (offs >= buffer.size ())
+	return EOF;
+
+      /* Convert through unsigned char, as fgetc does, so bytes >= 0x80 are
+       * not sign-extended where char is signed (0xFF would otherwise become
+       * -1, the usual value of EOF). */
+      return static_cast<unsigned char> (buffer.at (offs++));
+    }
+  };
+
+  // The input source for the lexer.
+  // InputSource input_source;
+  // Input file queue.
+  std::unique_ptr<InputSource> raw_input_source;
+  buffered_queue<int, InputSource &> input_queue;
+
+  // Token source wrapper thing.
+  struct TokenSource
+  {
+    // The lexer object that will use this TokenSource.
+    Lexer *lexer;
+
+    // Create a new TokenSource with given lexer.
+    TokenSource (Lexer *parLexer) : lexer (parLexer) {}
+
+    // Builds the next token by calling build_token () on the lexer.
+    TokenPtr next () { return lexer->build_token (); }
+  };
+
+  // The token source for the lexer.
+  // TokenSource token_source;
+  // Token stream queue.
+  buffered_queue<std::shared_ptr<Token>, TokenSource> token_queue;
+};
+
+} // namespace Rust
+
+#endif
--- a/gcc/rust/lex/rust-token.cc
+++ b/gcc/rust/lex/rust-token.cc
@@ -0,0 +1,134 @@
+// Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+#include "rust-token.h"
+#include "rust-diagnostics.h"
+
+namespace Rust {
+/* Returns the description string that RS_TOKEN_LIST pairs with ID (the second
+ * argument of its RS_TOKEN / RS_TOKEN_KEYWORD entry), e.g. "!=" for NOT_EQUAL
+ * or "fn" for FN_TOK.  The switch is generated by expanding RS_TOKEN_LIST
+ * (x-macros), one case per entry; any other value hits gcc_unreachable (). */
+const char *
+get_token_description (TokenId id)
+{
+  switch (id)
+    {
+#define RS_TOKEN(name, descr)                                                  \
+  case name:                                                                   \
+    return descr;
+#define RS_TOKEN_KEYWORD(x, y) RS_TOKEN (x, y)
+      RS_TOKEN_LIST
+#undef RS_TOKEN_KEYWORD
+#undef RS_TOKEN
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Returns the name of the TokenId enumerator itself as a string (the
+ * stringified first argument of each RS_TOKEN_LIST entry), e.g. "NOT_EQUAL"
+ * rather than "!=".  The switch is generated by expanding RS_TOKEN_LIST
+ * (x-macros), one case per entry; any other value hits gcc_unreachable (). */
+const char *
+token_id_to_str (TokenId id)
+{
+  switch (id)
+    {
+#define RS_TOKEN(name, _)                                                      \
+  case name:                                                                   \
+    return #name;
+#define RS_TOKEN_KEYWORD(x, y) RS_TOKEN (x, y)
+      RS_TOKEN_LIST
+#undef RS_TOKEN_KEYWORD
+#undef RS_TOKEN
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Maps a PrimitiveCoreType to a fixed, human-readable name.  Values without a
+ * dedicated entry, as well as CORETYPE_UNKNOWN, give "unknown". */
+const char *
+get_type_hint_string (PrimitiveCoreType type)
+{
+  switch (type)
+    {
+    // named primitives
+    case CORETYPE_BOOL:
+      return "bool";
+    case CORETYPE_CHAR:
+      return "char";
+    case CORETYPE_STR:
+      return "str";
+
+    // floating point
+    case CORETYPE_F32:
+      return "f32";
+    case CORETYPE_F64:
+      return "f64";
+
+    // signed integers (CORETYPE_INT has the same value as CORETYPE_ISIZE)
+    case CORETYPE_I8:
+      return "i8";
+    case CORETYPE_I16:
+      return "i16";
+    case CORETYPE_I32:
+      return "i32";
+    case CORETYPE_I64:
+      return "i64";
+    case CORETYPE_I128:
+      return "i128";
+    case CORETYPE_ISIZE:
+      return "isize";
+
+    // unsigned integers (CORETYPE_UINT has the same value as CORETYPE_USIZE)
+    case CORETYPE_U8:
+      return "u8";
+    case CORETYPE_U16:
+      return "u16";
+    case CORETYPE_U32:
+      return "u32";
+    case CORETYPE_U64:
+      return "u64";
+    case CORETYPE_U128:
+      return "u128";
+    case CORETYPE_USIZE:
+      return "usize";
+
+    case CORETYPE_PURE_DECIMAL:
+      return "pure_decimal";
+
+    case CORETYPE_UNKNOWN:
+    default:
+      break;
+    }
+
+  return "unknown";
+}
+
+/* Returns the name of this token's stored type hint via get_type_hint_string.
+ * The raw type_hint member is passed through unchanged, so a pure decimal
+ * token yields "pure_decimal" here. */
+const char *
+Token::get_type_hint_str () const
+{
+  return get_type_hint_string (type_hint);
+}
+
+/* Returns the text attached to this token.  A token that has no string
+ * reports an error at its location and yields a reference to a shared empty
+ * string instead. */
+const std::string &
+Token::get_str () const
+{
+  // FIXME: attempt to return null again
+  // gcc_assert(str != NULL);
+
+  // Usual case: the token owns a string.
+  if (has_str ())
+    return *str;
+
+  // HACK: allow referencing an empty string - a function-local static gives
+  // the returned reference something that outlives the call.
+  static const std::string no_text = "";
+
+  rust_error_at (get_locus (),
+		 "attempted to get string for %<%s%>, which has no string. "
+		 "returning empty string instead",
+		 get_token_description ());
+  return no_text;
+}
+} // namespace Rust
--- a/gcc/rust/lex/rust-token.h
+++ b/gcc/rust/lex/rust-token.h
@@ -0,0 +1,448 @@
+// Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_TOKEN_H
+#define RUST_TOKEN_H
+
+#include "rust-system.h"
+#include "rust-linemap.h"
+#include "rust-codepoint.h"
+
+namespace Rust {
+/* "Primitive core types" in Rust - the different int and float types, as well
+ * as some others.  Used as the type hint stored in a Token. */
+enum PrimitiveCoreType
+{
+  CORETYPE_UNKNOWN,
+  // named primitives
+  CORETYPE_BOOL,
+  CORETYPE_CHAR,
+  CORETYPE_STR,
+  // okay technically int and uint are arch-dependent (pointer size)
+  CORETYPE_INT,
+  CORETYPE_UINT,
+  // numbered number primitives
+  CORETYPE_F32,
+  CORETYPE_F64,
+  CORETYPE_I8,
+  CORETYPE_I16,
+  CORETYPE_I32,
+  CORETYPE_I64,
+  CORETYPE_I128,
+  CORETYPE_U8,
+  CORETYPE_U16,
+  CORETYPE_U32,
+  CORETYPE_U64,
+  CORETYPE_U128,
+  // Pure decimals are used for tuple index.
+  // Also means there is no type hint.
+  CORETYPE_PURE_DECIMAL,
+  /* arch-dependent pointer sizes; these are aliases with the same values as
+   * CORETYPE_INT and CORETYPE_UINT, not additional enumerators */
+  CORETYPE_ISIZE = CORETYPE_INT,
+  CORETYPE_USIZE = CORETYPE_UINT
+};
+
+/* X-macro list of every token kind.  Each entry is one of:
+ *   RS_TOKEN(name, description)
+ *   RS_TOKEN_KEYWORD(name, identifier)
+ * A user defines both macros, expands RS_TOKEN_LIST, then #undefs them again.
+ * The order of the entries is the order of the TokenId enumerators.
+ *
+ * Keep RS_TOKEN_KEYWORD sorted */
+
+/* note that abstract, async, become, box, do, final, macro, override, priv,
+ * try, typeof, unsized, virtual, and yield are unused */
+#define RS_TOKEN_LIST                                                          \
+  RS_TOKEN (FIRST_TOKEN, "<first-token-marker>")                               \
+  RS_TOKEN (END_OF_FILE, "end of file")                                        \
+  RS_TOKEN (EXCLAM, "!")                                                       \
+  RS_TOKEN (NOT_EQUAL, "!=")                                                   \
+  RS_TOKEN (PERCENT, "%")                                                      \
+  RS_TOKEN (PERCENT_EQ, "%=")                                                  \
+  RS_TOKEN (AMP, "&")                                                          \
+  RS_TOKEN (AMP_EQ, "&=")                                                      \
+  RS_TOKEN (LOGICAL_AND, "&&")                                                 \
+  RS_TOKEN (ASTERISK, "*")                                                     \
+  RS_TOKEN (ASTERISK_EQ, "*=")                                                 \
+  RS_TOKEN (PLUS, "+")                                                         \
+  RS_TOKEN (PLUS_EQ, "+=")                                                     \
+  RS_TOKEN (COMMA, ",")                                                        \
+  RS_TOKEN (MINUS, "-")                                                        \
+  RS_TOKEN (MINUS_EQ, "-=")                                                    \
+  RS_TOKEN (RETURN_TYPE, "->")                                                 \
+  RS_TOKEN (DOT, ".")                                                          \
+  RS_TOKEN (DOT_DOT, "..")                                                     \
+  RS_TOKEN (DOT_DOT_EQ, "..=")                                                 \
+  RS_TOKEN (ELLIPSIS, "...")                                                   \
+  RS_TOKEN (DIV, "/")                                                          \
+  RS_TOKEN (DIV_EQ, "/=")                                                      \
+  RS_TOKEN (COLON, ":")                                                        \
+  RS_TOKEN (SEMICOLON, ";")                                                    \
+  RS_TOKEN (LEFT_SHIFT, "<<")                                                  \
+  RS_TOKEN (LEFT_SHIFT_EQ, "<<=")                                              \
+  RS_TOKEN (LEFT_ANGLE, "<")                                                   \
+  RS_TOKEN (LESS_OR_EQUAL, "<=")                                               \
+  RS_TOKEN (EQUAL, "=")                                                        \
+  RS_TOKEN (EQUAL_EQUAL, "==")                                                 \
+  RS_TOKEN (MATCH_ARROW, "=>")                                                 \
+  RS_TOKEN (RIGHT_ANGLE, ">")                                                  \
+  RS_TOKEN (GREATER_OR_EQUAL, ">=")                                            \
+  RS_TOKEN (RIGHT_SHIFT, ">>")                                                 \
+  RS_TOKEN (RIGHT_SHIFT_EQ, ">>=")                                             \
+  RS_TOKEN (PATTERN_BIND, "@")                                                 \
+  RS_TOKEN (TILDE, "~")                                                        \
+  RS_TOKEN (BACKSLASH, "\\")                                                   \
+  RS_TOKEN (BACKTICK, "`")                                                     \
+  RS_TOKEN (CARET, "^")                                                        \
+  RS_TOKEN (CARET_EQ, "^=")                                                    \
+  RS_TOKEN (PIPE, "|")                                                         \
+  RS_TOKEN (PIPE_EQ, "|=")                                                     \
+  RS_TOKEN (OR, "||")                                                          \
+  RS_TOKEN (QUESTION_MARK, "?")                                                \
+  RS_TOKEN (HASH, "#")                                                         \
+  /* from here on, dodgy and may not be correct. not operators and may be      \
+   * symbols */                                                                \
+  /* RS_TOKEN(SPACE, " ") probably too dodgy */                                \
+  /* RS_TOKEN(NEWLINE, "\n")*/                                                 \
+  RS_TOKEN (SCOPE_RESOLUTION, "::") /* dodgy */                                \
+  RS_TOKEN (SINGLE_QUOTE, "'") /* should i differentiate from lifetime? */     \
+  RS_TOKEN (DOUBLE_QUOTE, "\"")                                                \
+  RS_TOKEN (UNDERSCORE,                                                        \
+	    "_") /* TODO: treat as reserved word like mrustc instead? */       \
+  RS_TOKEN (IDENTIFIER, "identifier")                                          \
+  RS_TOKEN (INT_LITERAL,                                                       \
+	    "integer literal") /* do different int and float types need        \
+				  different literal types? */                  \
+  RS_TOKEN (FLOAT_LITERAL, "float literal")                                    \
+  RS_TOKEN (STRING_LITERAL, "string literal")                                  \
+  RS_TOKEN (CHAR_LITERAL, "character literal")                                 \
+  RS_TOKEN (BYTE_STRING_LITERAL, "byte string literal")                        \
+  RS_TOKEN (BYTE_CHAR_LITERAL, "byte character literal")                       \
+  RS_TOKEN (LIFETIME, "lifetime") /* TODO: improve token type */               \
+  /* Have "interpolated" tokens (whatever that means)? identifier, path, type, \
+   * pattern, */                                                               \
+  /* expression, statement, block, meta, item in mrustc (but not directly in   \
+   * lexer). */                                                                \
+  RS_TOKEN (LEFT_PAREN, "(")                                                   \
+  RS_TOKEN (RIGHT_PAREN, ")")                                                  \
+  RS_TOKEN (LEFT_CURLY, "{")                                                   \
+  RS_TOKEN (RIGHT_CURLY, "}")                                                  \
+  RS_TOKEN (LEFT_SQUARE, "[")                                                  \
+  RS_TOKEN (RIGHT_SQUARE, "]")                                                 \
+  /* Macros */                                                                 \
+  RS_TOKEN (DOLLAR_SIGN, "$")                                                  \
+  /* Doc Comments */                                                           \
+  RS_TOKEN (INNER_DOC_COMMENT, "#![doc]")                                      \
+  RS_TOKEN (OUTER_DOC_COMMENT, "#[doc]")                                       \
+  /* have "weak" union and 'static keywords? */                                \
+  RS_TOKEN_KEYWORD (ABSTRACT, "abstract") /* unused */                         \
+  RS_TOKEN_KEYWORD (AS, "as")                                                  \
+  RS_TOKEN_KEYWORD (ASYNC, "async")   /* unused */                             \
+  RS_TOKEN_KEYWORD (BECOME, "become") /* unused */                             \
+  RS_TOKEN_KEYWORD (BOX, "box")	      /* unused */                             \
+  RS_TOKEN_KEYWORD (BREAK, "break")                                            \
+  RS_TOKEN_KEYWORD (CONST, "const")                                            \
+  RS_TOKEN_KEYWORD (CONTINUE, "continue")                                      \
+  RS_TOKEN_KEYWORD (CRATE, "crate")                                            \
+  /* FIXME: Do we need to add $crate (DOLLAR_CRATE) as a reserved kw? */       \
+  RS_TOKEN_KEYWORD (DO, "do") /* unused */                                     \
+  RS_TOKEN_KEYWORD (DYN, "dyn")                                                \
+  RS_TOKEN_KEYWORD (ELSE, "else")                                              \
+  RS_TOKEN_KEYWORD (ENUM_TOK, "enum")                                          \
+  RS_TOKEN_KEYWORD (EXTERN_TOK, "extern")                                      \
+  RS_TOKEN_KEYWORD (FALSE_LITERAL, "false")                                    \
+  RS_TOKEN_KEYWORD (FINAL_TOK, "final") /* unused */                           \
+  RS_TOKEN_KEYWORD (FN_TOK, "fn")                                              \
+  RS_TOKEN_KEYWORD (FOR, "for")                                                \
+  RS_TOKEN_KEYWORD (IF, "if")                                                  \
+  RS_TOKEN_KEYWORD (IMPL, "impl")                                              \
+  RS_TOKEN_KEYWORD (IN, "in")                                                  \
+  RS_TOKEN_KEYWORD (LET, "let")                                                \
+  RS_TOKEN_KEYWORD (LOOP, "loop")                                              \
+  RS_TOKEN_KEYWORD (MACRO, "macro") /* unused */                               \
+  RS_TOKEN_KEYWORD (MATCH_TOK, "match")                                        \
+  RS_TOKEN_KEYWORD (MOD, "mod")                                                \
+  RS_TOKEN_KEYWORD (MOVE, "move")                                              \
+  RS_TOKEN_KEYWORD (MUT, "mut")                                                \
+  RS_TOKEN_KEYWORD (OVERRIDE_TOK, "override") /* unused */                     \
+  RS_TOKEN_KEYWORD (PRIV, "priv")	      /* unused */                     \
+  RS_TOKEN_KEYWORD (PUB, "pub")                                                \
+  RS_TOKEN_KEYWORD (REF, "ref")                                                \
+  RS_TOKEN_KEYWORD (RETURN_TOK, "return")                                      \
+  RS_TOKEN_KEYWORD (SELF_ALIAS,                                                \
+		    "Self") /* mrustc does not treat this as a reserved word*/ \
+  RS_TOKEN_KEYWORD (SELF, "self")                                              \
+  RS_TOKEN_KEYWORD (STATIC_TOK, "static")                                      \
+  RS_TOKEN_KEYWORD (STRUCT_TOK, "struct")                                      \
+  RS_TOKEN_KEYWORD (SUPER, "super")                                            \
+  RS_TOKEN_KEYWORD (TRAIT, "trait")                                            \
+  RS_TOKEN_KEYWORD (TRUE_LITERAL, "true")                                      \
+  RS_TOKEN_KEYWORD (TRY, "try") /* unused */                                   \
+  RS_TOKEN_KEYWORD (TYPE, "type")                                              \
+  RS_TOKEN_KEYWORD (TYPEOF, "typeof") /* unused */                             \
+  RS_TOKEN_KEYWORD (UNSAFE, "unsafe")                                          \
+  RS_TOKEN_KEYWORD (UNSIZED, "unsized") /* unused */                           \
+  RS_TOKEN_KEYWORD (USE, "use")                                                \
+  RS_TOKEN_KEYWORD (VIRTUAL, "virtual") /* unused */                           \
+  RS_TOKEN_KEYWORD (WHERE, "where")                                            \
+  RS_TOKEN_KEYWORD (WHILE, "while")                                            \
+  RS_TOKEN_KEYWORD (YIELD, "yield") /* unused */                               \
+  RS_TOKEN (LAST_TOKEN, "<last-token-marker>")
+
+/* Contains all token types: one enumerator per RS_TOKEN_LIST entry, in list
+ * order (x-macro expansion), so FIRST_TOKEN is 0 and LAST_TOKEN comes last. */
+enum TokenId
+{
+#define RS_TOKEN(name, _) name,
+#define RS_TOKEN_KEYWORD(x, y) RS_TOKEN (x, y)
+  RS_TOKEN_LIST
+#undef RS_TOKEN_KEYWORD
+#undef RS_TOKEN
+};
+
+// Forward declaration of Token so the pointer typedefs below can name it.
+class Token;
+// A smart pointer (shared_ptr) to Token.
+typedef std::shared_ptr<Token> TokenPtr;
+// A smart pointer (shared_ptr) to a constant Token.
+typedef std::shared_ptr<const Token> const_TokenPtr;
+
+// Gets the description string RS_TOKEN_LIST gives for a token id (x-macros).
+const char *
+get_token_description (TokenId id);
+/* Gets the name of the token id enumerator itself as a string, e.g.
+ * "NOT_EQUAL" (x-macros). */
+const char *
+token_id_to_str (TokenId id);
+// Get type hint description as a string.
+const char *
+get_type_hint_string (PrimitiveCoreType type);
+
+/* Represents a single token: its kind, its location, an optional string and a
+ * type hint.  Constructors are private; create tokens using the static make*
+ * factory methods, which return a TokenPtr. */
+class Token
+{
+private:
+  // Token kind.
+  TokenId token_id;
+  // Token location.
+  Location locus;
+  // Associated text (if any) of token. Null when the token has no text.
+  std::unique_ptr<std::string> str;
+  // TODO: maybe remove issues and just store std::string as value?
+  /* Type hint for token based on lexer data (e.g. type suffix). Does not exist
+   * for most tokens. */
+  PrimitiveCoreType type_hint;
+
+  // Token constructor from token id and location. Has a null string.
+  Token (TokenId token_id, Location location)
+    : token_id (token_id), locus (location), str (nullptr),
+      type_hint (CORETYPE_UNKNOWN)
+  {}
+
+  // Token constructor from token id, location, and a string.
+  Token (TokenId token_id, Location location, std::string &&paramStr)
+    : token_id (token_id), locus (location),
+      str (new std::string (std::move (paramStr))), type_hint (CORETYPE_UNKNOWN)
+  {}
+
+  // Token constructor from token id, location, and a char.
+  Token (TokenId token_id, Location location, char paramChar)
+    : token_id (token_id), locus (location),
+      str (new std::string (1, paramChar)), type_hint (CORETYPE_UNKNOWN)
+  {}
+
+  // Token constructor from token id, location, and a "codepoint".
+  Token (TokenId token_id, Location location, Codepoint paramCodepoint)
+    : token_id (token_id), locus (location),
+      str (new std::string (paramCodepoint.as_string ())),
+      type_hint (CORETYPE_UNKNOWN)
+  {}
+
+  // Token constructor from token id, location, a string, and type hint.
+  Token (TokenId token_id, Location location, std::string &&paramStr,
+	 PrimitiveCoreType parType)
+    : token_id (token_id), locus (location),
+      str (new std::string (std::move (paramStr))), type_hint (parType)
+  {}
+
+public:
+  // No default constructor.
+  Token () = delete;
+  // Do not copy/assign tokens.
+  Token (const Token &) = delete;
+  Token &operator= (const Token &) = delete;
+
+  // Allow moving tokens.
+  Token (Token &&other) = default;
+  Token &operator= (Token &&other) = default;
+
+  ~Token () = default;
+
+  /* TODO: make_shared (which saves a heap allocation) does not work with the
+   * private constructor */
+
+  // Makes and returns a new TokenPtr (with null string).
+  static TokenPtr make (TokenId token_id, Location locus)
+  {
+    // return std::make_shared<Token> (token_id, locus);
+    return TokenPtr (new Token (token_id, locus));
+  }
+
+  // Makes and returns a new TokenPtr of type IDENTIFIER.
+  static TokenPtr make_identifier (Location locus, std::string &&str)
+  {
+    // return std::make_shared<Token> (IDENTIFIER, locus, str);
+    return TokenPtr (new Token (IDENTIFIER, locus, std::move (str)));
+  }
+
+  // Makes and returns a new TokenPtr of type INT_LITERAL.
+  static TokenPtr make_int (Location locus, std::string &&str,
+			    PrimitiveCoreType type_hint = CORETYPE_UNKNOWN)
+  {
+    // return std::make_shared<Token> (INT_LITERAL, locus, str, type_hint);
+    return TokenPtr (
+      new Token (INT_LITERAL, locus, std::move (str), type_hint));
+  }
+
+  // Makes and returns a new TokenPtr of type FLOAT_LITERAL.
+  static TokenPtr make_float (Location locus, std::string &&str,
+			      PrimitiveCoreType type_hint = CORETYPE_UNKNOWN)
+  {
+    // return std::make_shared<Token> (FLOAT_LITERAL, locus, str, type_hint);
+    return TokenPtr (
+      new Token (FLOAT_LITERAL, locus, std::move (str), type_hint));
+  }
+
+  // Makes and returns a new TokenPtr of type STRING_LITERAL.
+  static TokenPtr make_string (Location locus, std::string &&str)
+  {
+    // return std::make_shared<Token> (STRING_LITERAL, locus, str,
+    // CORETYPE_STR);
+    return TokenPtr (
+      new Token (STRING_LITERAL, locus, std::move (str), CORETYPE_STR));
+  }
+
+  // Makes and returns a new TokenPtr of type CHAR_LITERAL.
+  static TokenPtr make_char (Location locus, Codepoint char_lit)
+  {
+    // return std::make_shared<Token> (CHAR_LITERAL, locus, char_lit);
+    return TokenPtr (new Token (CHAR_LITERAL, locus, char_lit));
+  }
+
+  // Makes and returns a new TokenPtr of type BYTE_CHAR_LITERAL.
+  static TokenPtr make_byte_char (Location locus, char byte_char)
+  {
+    // return std::make_shared<Token> (BYTE_CHAR_LITERAL, locus, byte_char);
+    return TokenPtr (new Token (BYTE_CHAR_LITERAL, locus, byte_char));
+  }
+
+  // Makes and returns a new TokenPtr of type BYTE_STRING_LITERAL (fix).
+  static TokenPtr make_byte_string (Location locus, std::string &&str)
+  {
+    // return std::make_shared<Token> (BYTE_STRING_LITERAL, locus, str);
+    return TokenPtr (new Token (BYTE_STRING_LITERAL, locus, std::move (str)));
+  }
+
+  // Makes and returns a new TokenPtr of type INNER_DOC_COMMENT.
+  static TokenPtr make_inner_doc_comment (Location locus, std::string &&str)
+  {
+    return TokenPtr (new Token (INNER_DOC_COMMENT, locus, std::move (str)));
+  }
+
+  // Makes and returns a new TokenPtr of type OUTER_DOC_COMMENT.
+  static TokenPtr make_outer_doc_comment (Location locus, std::string &&str)
+  {
+    return TokenPtr (new Token (OUTER_DOC_COMMENT, locus, std::move (str)));
+  }
+
+  // Makes and returns a new TokenPtr of type LIFETIME.
+  static TokenPtr make_lifetime (Location locus, std::string &&str)
+  {
+    // return std::make_shared<Token> (LIFETIME, locus, str);
+    return TokenPtr (new Token (LIFETIME, locus, std::move (str)));
+  }
+
+  // Gets id of the token.
+  TokenId get_id () const { return token_id; }
+
+  // Gets location of the token.
+  Location get_locus () const { return locus; }
+
+  /* Gets the text associated with the token. Defined in rust-token.cc: when
+   * the token has no string it reports an error and returns an empty string.
+   * FIXME: put in header again when fix null problem */
+  const std::string &
+  get_str () const;
+
+  /* Gets token's type hint info. A pure decimal token reports
+   * CORETYPE_UNKNOWN here; use is_pure_decimal () to detect that case. */
+  PrimitiveCoreType get_type_hint () const
+  {
+    return type_hint == CORETYPE_PURE_DECIMAL ? CORETYPE_UNKNOWN : type_hint;
+  }
+
+  // diagnostics (error reporting)
+  const char *get_token_description () const
+  {
+    return Rust::get_token_description (token_id);
+  }
+
+  // debugging
+  const char *token_id_to_str () const
+  {
+    return Rust::token_id_to_str (token_id);
+  }
+
+  // debugging
+  const char *get_type_hint_str () const;
+
+  /* Returns whether the token is a literal of any type (int, float, char,
+   * string, byte char, byte string). */
+  bool is_literal () const
+  {
+    switch (token_id)
+      {
+      case INT_LITERAL:
+      case FLOAT_LITERAL:
+      case CHAR_LITERAL:
+      case STRING_LITERAL:
+      case BYTE_CHAR_LITERAL:
+      case BYTE_STRING_LITERAL:
+	return true;
+      default:
+	return false;
+      }
+  }
+
+  /* Returns whether the token actually has a string (regardless of whether it
+   * should or not). */
+  bool has_str () const { return str != nullptr; }
+
+  // Returns whether the token should have a string.
+  bool should_have_str () const
+  {
+    return is_literal () || token_id == IDENTIFIER || token_id == LIFETIME;
+  }
+
+  // Returns whether the token is a pure decimal int literal
+  bool is_pure_decimal () const { return type_hint == CORETYPE_PURE_DECIMAL; }
+};
+} // namespace Rust
+
+#endif
--- a/gcc/rust/rust-buffered-queue.h
+++ b/gcc/rust/rust-buffered-queue.h
@@ -0,0 +1,204 @@
+// Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_BUFFERED_QUEUE_H
+#define RUST_BUFFERED_QUEUE_H
+
+#include "rust-system.h"
+
+namespace Rust {
+/* Buffered queue implementation. Items are of type T and are pulled on demand
+ * from a source of type Source by calling source.next (). Note that this is
+ * owning of the source.
+ *
+ * The queued items live in `buffer` at indices [start, end). Slots outside
+ * that range hold default-constructed T values so that indexing always
+ * works. */
+template <typename T, typename Source> class buffered_queue
+{
+public:
+  // Construct empty queue from Source src.
+  buffered_queue (Source src) : source (src), start (0), end (0), buffer () {}
+
+  /* disable copying (since source is probably non-copyable)
+   * TODO is this actually a good idea? If source is non-copyable, it would
+   * just delete the copy constructor anyway.*/
+  buffered_queue (const buffered_queue &other) = delete;
+  buffered_queue &operator= (const buffered_queue &other) = delete;
+
+  // enable moving
+  buffered_queue (buffered_queue &&other) = default;
+  buffered_queue &operator= (buffered_queue &&other) = default;
+
+  /* Returns (by value) the item at position start + n, i.e. n items ahead of
+   * the front of the queue. Any items up to and including that one which are
+   * not buffered yet are first read from the source. n must not be
+   * negative. */
+  T peek (int n)
+  {
+    // n should not be behind
+    rust_assert (n >= 0);
+
+    int num_queued_items = end - start;
+    int num_items_required = n + 1;
+
+    // if required items go past end of queue, add them to queue
+    if (num_items_required > num_queued_items)
+      {
+	int num_items_to_read = num_items_required - num_queued_items;
+
+	/* if queue length + extra items is larger than buffer size, expand
+	 * buffer */
+	if (end + num_items_to_read > (int) buffer.size ())
+	  {
+	    // New size is 1.5x (current buffer size + items to read)
+	    int new_size = (buffer.size () + num_items_to_read);
+	    new_size += (new_size >> 1);
+
+	    /* Move the queued items to the front of a fresh vector, which
+	     * also reclaims the slots before start. Reserving the full size
+	     * up front means filling it does not reallocate.
+	     * TODO: determine overhead of this approach vs copy. Should be
+	     * lower. */
+	    std::vector<T> new_queue;
+	    new_queue.reserve (new_size);
+	    new_queue.insert (new_queue.begin (),
+			      std::make_move_iterator (buffer.begin () + start),
+			      std::make_move_iterator (buffer.begin () + end));
+	    start = 0;
+	    end = num_queued_items;
+	    // fill up rest of vector with junk so that indexing can work
+	    new_queue.insert (new_queue.begin () + end,
+			      new_size - new_queue.size (), T ());
+
+	    buffer = std::move (new_queue);
+
+	    // validate that buffer is large enough now
+	    rust_assert (end + num_items_to_read <= (int) buffer.size ());
+	  }
+
+	/* read the missing items from the source into the slots past the
+	 * original end */
+	for (int i = 0; i < num_items_to_read; i++)
+	  buffer[end + i] = source.next ();
+
+	// move end based on additional items added
+	end += num_items_to_read;
+      }
+
+    rust_assert (0 <= start);
+    rust_assert (start <= end);
+    rust_assert (end <= (int) buffer.size ());
+
+    rust_assert (start + n < end);
+
+    // return value at start + n in buffer
+    return buffer[start + n];
+  }
+
+  /* TODO: add faster peek current token to remove overhead of conditional
+   * branches? */
+
+  /* Advances start by n + 1, i.e. discards items 0 to n (inclusive) counted
+   * from the front of the queue. Items not buffered yet are read from the
+   * source first. */
+  void skip (int n)
+  {
+    // Call peek to ensure requested n is actually in queue.
+    peek (n);
+
+    // Clear queue values from start to n (inclusive).
+    for (int i = 0; i < (n + 1); i++)
+      buffer[start + i] = T ();
+
+    // Move start forward by n + 1.
+    start += (n + 1);
+
+    // Ensure start is not impossible somehow
+    rust_assert (0 <= start);
+    rust_assert (start <= end);
+
+    // Compact buffer if empty
+    if (start == end)
+      start = end = 0;
+  }
+
+  /* Inserts element at front of the queue, so that it becomes the item
+   * returned by peek (0). Really dirty hack with terrible performance (every
+   * later element of the vector is shifted along by one), only use when
+   * really needed. */
+  void insert_at_front (T elem_to_insert)
+  {
+    /* The front of the queue is at index start, which is not necessarily
+     * index 0 of the vector: inserting at buffer.begin () would leave the new
+     * element outside the queued range [start, end) whenever start > 0. */
+    buffer.insert (buffer.begin () + start, std::move (elem_to_insert));
+
+    /* Increase the end number since added element means all others have shifted
+     * one along */
+    end++;
+  }
+
+  /* Insert at arbitrary position (attempt): the element ends up index items
+   * ahead of the front of the queue. index must not be negative. */
+  void insert (int index, T elem_to_insert)
+  {
+    // TODO: test as this may not work properly
+
+    // n should not be behind
+    rust_assert (index >= 0);
+
+    // call peek to ensure that the items behind this (at least) are in queue
+    if (index >= 1)
+      peek (index - 1);
+    else
+      peek (index);
+
+    buffer.insert (buffer.begin () + start + index, std::move (elem_to_insert));
+
+    end++;
+  }
+
+  /* Replaces the current value in the buffer (the item at the front of the
+   * queue), reading it from the source first if necessary. Total HACK. */
+  void replace_current_value (T replacement)
+  {
+    // call peek to ensure value exists
+    peek (0);
+
+    buffer[start] = std::move (replacement);
+
+    // don't move start or end
+  }
+
+private:
+  // Source of tokens for queue.
+  Source source;
+
+  // Begin of range in buffer, inclusive.
+  int start;
+  // End of range in buffer, exclusive.
+  int end;
+
+  // Queue buffer.
+  std::vector<T> buffer;
+};
+} // namespace Rust
+
+#endif