17 #include "../base/vector.h" 32 TokenInfo(
const std::string & n,
const std::string & r,
size_t i,
bool s=
false)
33 : name(n), regex(r), id(i), save_lexeme(s) { ; }
38 void Print(std::ostream & os=std::cout)
const {
42 <<
" save_lexeme:" << save_lexeme
52 Token(
size_t id,
const std::string & str=
"") : token_id(id), lexeme(str) { ; }
57 operator size_t() {
return token_id; }
60 operator const std::string &() {
return lexeme; }
68 mutable bool generate_lexer;
69 mutable DFA lexer_dfa;
73 static const size_t MAX_TOKEN_ID = 256;
74 static const size_t ERROR_ID = MAX_TOKEN_ID;
75 static inline bool TokenOK(
size_t id) {
return id < MAX_TOKEN_ID; }
78 : token_set(), cur_token_id(MAX_TOKEN_ID), generate_lexer(false), lexer_dfa(), lexeme() { }
85 size_t AddToken(
const std::string & in_name,
const std::string & in_regex) {
87 generate_lexer =
true;
88 token_set.
emplace_back( in_name, in_regex, cur_token_id );
93 constexpr
static size_t MaxTokenID() {
return MAX_TOKEN_ID; }
97 for (
const auto & t : token_set) {
98 if (t.name == name)
return t.id;
105 if (
id >= MAX_TOKEN_ID)
return "Error";
106 if (
id == 0)
return "EOF";
108 for (
const auto & t : token_set) {
109 if (t.id ==
id)
return t.name;
116 for (
const auto & t : token_set) {
117 if (t.name == name)
return t;
125 for (
const auto & t : token_set) {
128 generate_lexer =
false;
129 lexer_dfa =
to_DFA(lexer_nfa);
134 if (generate_lexer) Generate();
146 while (cur_stop >= 0 && cur_state >= 0 && is) {
147 const char next_char = (char) is.get();
149 cur_state = lexer_dfa.
Next(cur_state, (
size_t) next_char);
150 cur_stop = lexer_dfa.
GetStop(cur_state);
151 if (cur_stop > 0) { best_pos = cur_pos; best_stop = cur_stop; }
152 lexeme.push_back( next_char );
156 if (best_pos > 0 && best_pos < cur_pos) {
157 lexeme.resize(best_pos);
158 while (best_pos < cur_pos) { is.unget(); cur_pos--; }
163 if (!is)
return { 0,
"" };
164 return { ERROR_ID, lexeme };
167 return { (size_t) best_stop, lexeme };
172 std::stringstream ss;
174 auto out_val = Process(ss);
183 void Print(std::ostream & os=std::cout)
const {
184 for (
const auto & t : token_set) t.Print(os);
185 if (generate_lexer) Generate();
static const NFA & to_NFA(const NFA &nfa)
Converting NFA to NFA – no change needed.
Definition: lexer_utils.h:30
TokenInfo & operator=(const TokenInfo &)=default
int Next(int state, size_t sym) const
Return the new state after a symbol occurs.
Definition: DFA.h:90
Token(size_t id, const std::string &str="")
Definition: Lexer.h:52
A basic regular expression handler.
Definition: RegEx.h:55
size_t GetNumTokens() const
How many types of tokens can be identified in this Lexer?
Definition: Lexer.h:82
A dynamic NFA class, for easily building non-deterministic finite automata.
Definition: NFA.h:37
void Merge(const tNFA< NUM_SYMBOLS, STOP_TYPE > &nfa2)
Merge another NFA into this one.
Definition: NFA.h:189
static bool TokenOK(size_t id)
Definition: Lexer.h:75
size_t token_id
Which type of token is this?
Definition: Lexer.h:49
void Print(std::ostream &os=std::cout)
Print details about this DFA.
Definition: DFA.h:109
TokenInfo(const std::string &n, const std::string &r, size_t i, bool s=false)
Definition: Lexer.h:32
A lexer with a set of token types (and associated regular expressions)
Definition: Lexer.h:64
std::string AsString() const
Convert the RegEx to a standard string, readable from outside this class.
Definition: RegEx.h:482
Lexer()
Definition: Lexer.h:77
static constexpr size_t MaxTokenID()
How many total token types are allowed in this lexer?
Definition: Lexer.h:93
size_t size() const
Definition: vector.h:151
static const DFA & to_DFA(const DFA &dfa)
Converting DFA to DFA – no change needed.
Definition: lexer_utils.h:27
Token Process(std::istream &is)
Get the next token found in an input stream.
Definition: Lexer.h:133
void emplace_back(ARGS &&...args)
Definition: vector.h:219
Information about a token instance from an input stream.
Definition: Lexer.h:48
RegEx regex
Pattern to describe token type.
Definition: Lexer.h:27
const std::string & GetLexeme()
Get the lexeme associated with the last token identified.
Definition: Lexer.h:180
TokenInfo GetTokenInfo(const std::string &name) const
Get the full information about a token (you provide the name)
Definition: Lexer.h:115
~Lexer()
Definition: Lexer.h:79
static const PrintStr endl("<br>")
Pre-define emp::endl to insert a "<br>" and thus act like a newline.
std::string lexeme
The specific sequence matched by this token.
Definition: Lexer.h:50
Information about an individual token type to be processed within a Lexer.
Definition: Lexer.h:25
void Print(std::ostream &os=std::cout) const
Print the full information about this lexer (for debugging)
Definition: Lexer.h:183
static std::string to_escaped_string(char value)
Convert a single character to one that uses a proper escape sequence (in a string) if needed...
Definition: string_utils.h:36
void Print(std::ostream &os=std::cout) const
Print out the status of this token (for debugging)
Definition: Lexer.h:38
void Generate() const
Create the NFA that will identify the current set of tokens in a sequence.
Definition: Lexer.h:123
If we are in emscripten, make sure to include the header.
Definition: array.h:37
std::string GetTokenName(size_t id) const
Get the name associated with a token type (you provide the ID)
Definition: Lexer.h:104
Build a debug wrapper emp::vector around std::vector.
Definition: vector.h:42
Basic regular expression handler.
std::string name
Name of this token type.
Definition: Lexer.h:26
bool save_lexeme
Should we preserve the lexeme for this token?
Definition: Lexer.h:29
A set of utilities to convert between NFAs and DFAs.
Token Process(std::string &in_str)
Shortcut to process a string rather than a stream.
Definition: Lexer.h:171
stop_t GetStop(int state) const
Get the stop value associated with a state.
Definition: DFA.h:76
size_t id
Unique id for token.
Definition: Lexer.h:28
size_t GetTokenID(const std::string &name) const
Get the ID associated with a token type (you provide the token name)
Definition: Lexer.h:96
size_t AddToken(const std::string &in_name, const std::string &in_regex)
Add a new token, specified by a name and the regex used to identify it.
Definition: Lexer.h:85