Empirical
ConfigLexer.h
Go to the documentation of this file.
1 // This file is part of Empirical, https://github.com/devosoft/Empirical
2 // Copyright (C) Michigan State University, 2016-2018.
3 // Released under the MIT Software license; see doc/LICENSE
4 //
5 // A simple lexer for the Empirical configuration language.
6 //
7 //
8 // Development notes: Initially building the lexer to be language specific, but a
9 // more general lexer will be a good addition to Empirical.
10 
11 #ifndef EMP_CONFIG_LEXER_H
12 #define EMP_CONFIG_LEXER_H
13 
14 #include <iostream>
15 #include <map>
16 #include <string>
17 
18 #include "../base/errors.h"
19 
20 #include "Token.h"
21 
22 namespace emp {
23 
24  class ConfigLexer {
25  private:
26  std::map<std::string, emp::Token> command_map;
27 
28  std::istream & is;
29  char next_char;
30  std::string cur_lexeme;
31 
32  struct Pattern {
33  std::string name; // Unique name for this token.
34  std::string pattern; // Regular expression used to identify instances of token.
35  int token_id; // Unique ID for this token.
36  bool ignore; // Should we skip over these tokens when looking for the next one?
37  };
38 
39  std::map<std::string, Pattern> patterns;
40  int next_token_id;
41 
42  public:
43  ConfigLexer(std::istream & in_stream) : is(in_stream), next_token_id(256) {
44  command_map["print"] = Token(Token::COMMAND_PRINT);
45  command_map["include"] = Token(Token::COMMAND_INCLUDE);
46  command_map["if"] = Token(Token::COMMAND_IF);
47  command_map["else"] = Token(Token::COMMAND_ELSE);
48  command_map["while"] = Token(Token::COMMAND_WHILE);
49  command_map["break"] = Token(Token::COMMAND_BREAK);
50  command_map["continue"] = Token(Token::COMMAND_CONTINUE);
51  command_map["return"] = Token(Token::COMMAND_RETURN);
52  command_map["function"] = Token(Token::COMMAND_FUNCTION);
53  command_map["foreach"] = Token(Token::COMMAND_FOREACH);
54  command_map["random"] = Token(Token::FUN_RANDOM);
55 
56  // Prime the first character so it's ready to go.
57  is.get(next_char);
58 
60  }
61  ConfigLexer(ConfigLexer &) = delete;
62  ~ConfigLexer() { ; }
63 
64  int AddPattern(const std::string & name, const std::string & pattern, int id=0, bool ignore=false) {
65  if (patterns.find(name) != patterns.end()) {
66  emp::LibraryWarning("Attempting to add multiple lexer patterns for '", name, "'. Ignoring.");
67  return -1;
68  }
69  if (!id) id = next_token_id++;
70  else if (id >= next_token_id) next_token_id = id+1;
71 
72  patterns[name] = { name, pattern, id, ignore };
73  return id;
74  }
75 
77  AddPattern("WHITESPACE", "[ \t\r]", Token::WHITESPACE, true);
78  AddPattern("COMMENT", "#.*", Token::COMMENT, true);
79  AddPattern("INT_LIT", "[0-9]+", Token::INT_LIT);
80  AddPattern("FLOAT_LIT", "[0-9]+[.]'[0-9]+", Token::FLOAT_LIT);
81  AddPattern("CHAR_LIT", "'(.|(\\\\[\\\\'nt]))'", Token::CHAR_LIT);
82 
83  AddPattern("STRING_LIT", "[\"](\\\\[nt\"\\]|[^\\\"])*\"", Token::STRING_LIT);
84  AddPattern("ID", "[a-zA-Z0-9_]+", Token::ID);
85 
86  // The rest are completely determined by their type.
87  AddPattern("ENDLINE", "[\n;]", Token::ENDLINE);
88  AddPattern("CASSIGN_ADD", "\"+=\"", Token::CASSIGN_ADD);
89  AddPattern("CASSIGN_SUB", "\"-=\"", Token::CASSIGN_SUB);
90  AddPattern("CASSIGN_MULT", "\"*=\"", Token::CASSIGN_MULT);
91  AddPattern("CASSIGN_DIV", "\"/=\"", Token::CASSIGN_DIV);
92  AddPattern("CASSIGN_MOD", "\"%=\"", Token::CASSIGN_MOD);
93  AddPattern("COMP_EQU", "==", Token::COMP_EQU);
94  AddPattern("COMP_NEQU", "!=", Token::COMP_NEQU);
95  AddPattern("COMP_LESS", "<", Token::COMP_LESS);
96  AddPattern("COMP_LTE", "<=", Token::COMP_LTE);
97  AddPattern("COMP_GTR", ">", Token::COMP_GTR);
98  AddPattern("COMP_GTE", ">=", Token::COMP_GTE);
99  AddPattern("BOOL_AND", "&&", Token::BOOL_AND);
100  AddPattern("BOOL_OR", "||", Token::BOOL_OR);
101 
102  // Statement commands
103  AddPattern("COMMAND_PRINT", "print", Token::COMMAND_PRINT);
104  AddPattern("COMMAND_INCLUDE", "include", Token::COMMAND_INCLUDE);
105  AddPattern("COMMAND_IF", "if", Token::COMMAND_IF);
106  AddPattern("COMMAND_ELSE", "else", Token::COMMAND_ELSE);
107  AddPattern("COMMAND_WHILE", "while", Token::COMMAND_WHILE);
108  AddPattern("COMMAND_FOREACH", "foreach", Token::COMMAND_FOREACH);
109  AddPattern("COMMAND_BREAK", "break", Token::COMMAND_BREAK);
110  AddPattern("COMMAND_CONTINUE", "continue", Token::COMMAND_CONTINUE);
111  AddPattern("COMMAND_FUNCTION", "function", Token::COMMAND_FUNCTION);
112  AddPattern("COMMAND_RETURN", "return", Token::COMMAND_RETURN);
113 
114  // Built-in functions.
115  AddPattern("FUN_RANDOM", "random", Token::FUN_RANDOM);
116  }
117 
118  int GetMaxToken() const { return next_token_id; }
119 
120  // @CAO For the moment, GetToken is pre-build using the default token set.
122  while (next_char > 0) { // Keep looping until we find a token or hit EOF.
123  if (is_digit(next_char)) { // Must be a number
124  cur_lexeme.resize(1);
125  cur_lexeme[0] = next_char;
126  while (is_digit(next_char = is.get())) {
127  cur_lexeme.push_back(next_char);
128  }
129  if (next_char == '.') { // Must be floating point.
130  cur_lexeme.push_back('.');
131  while (is_digit(next_char = is.get())) {
132  cur_lexeme.push_back(next_char);
133  }
134  return Token(Token::FLOAT_LIT, cur_lexeme);
135  }
136  return Token(Token::INT_LIT, cur_lexeme);
137  }
138  if (is_idchar(next_char)) { // Must be ID or Keyword (number already captured)
139  cur_lexeme.resize(1);
140  cur_lexeme[0] = next_char;
141  while (is_idchar(next_char = is.get())) {
142  cur_lexeme.push_back(next_char);
143  }
144 
145  // Determine if the current lexeme is a command.
146  auto map_ptr = command_map.find(cur_lexeme);
147  if (map_ptr != command_map.end()) {
148  return map_ptr->second;
149  }
150 
151  return Token(Token::ID, cur_lexeme);
152  }
153 
154  // Any remaining possibilities start with a specific chatacter (or small set of options).
155  // Advance next_char now for look-ahead.
156  char prev_char = next_char;
157  next_char = is.get();
158 
159  switch (prev_char) {
160  case '#': // Comment. Delete to end of line.
161  while (next_char != '\n') next_char = is.get();
162  next_char = is.get();
163  break; // Comments don't return a token.
164  case '\"':
165  cur_lexeme.resize(0);
166  next_char = is.get(); // Get first char of string.
167  while (next_char != '\"') { cur_lexeme.push_back(next_char); next_char = is.get(); }
168  next_char = is.get();
169  return Token(Token::STRING_LIT, cur_lexeme);
170  case ' ':
171  case '\t':
172  case '\r':
173  break; // Skip WS; don't return token.
174  case '\n':
175  case ';':
176  return Token(Token::ENDLINE);
177  case '+':
178  if (next_char == '=') { next_char = is.get(); return Token(Token::CASSIGN_ADD); }
179  return Token('+');
180  case '-':
181  if (next_char == '=') { next_char = is.get(); return Token(Token::CASSIGN_SUB); }
182  return Token('-');
183  case '*':
184  if (next_char == '=') { next_char = is.get(); return Token(Token::CASSIGN_MULT); }
185  return Token('*');
186  case '/':
187  if (next_char == '=') { next_char = is.get(); return Token(Token::CASSIGN_DIV); }
188  return Token('/');
189  case '%':
190  if (next_char == '=') { next_char = is.get(); return Token(Token::CASSIGN_MOD); }
191  return Token('%');
192  case '=':
193  if (next_char == '=') { next_char = is.get(); return Token(Token::COMP_EQU); }
194  return Token('=');
195  case '<':
196  if (next_char == '=') { next_char = is.get(); return Token(Token::COMP_LTE); }
197  return Token(Token::COMP_LESS);
198  case '>':
199  if (next_char == '=') { next_char = is.get(); return Token(Token::COMP_GTE); }
200  return Token(Token::COMP_GTR);
201  case '!':
202  if (next_char == '=') { next_char = is.get(); return Token(Token::COMP_NEQU); }
203  return Token('!');
204  case '&':
205  if (next_char == '&') { next_char = is.get(); return Token(Token::BOOL_AND); }
206  return Token('&');
207  case '|':
208  if (next_char == '|') { next_char = is.get(); return Token(Token::BOOL_OR); }
209  return Token('|');
210  case '\'': // Char literal.
211  if (next_char == '\\') {
212  next_char = is.get();
213  switch (next_char) {
214  case 'n': next_char = '\n'; break;
215  case 'r': next_char = '\r'; break;
216  case 't': next_char = '\t'; break;
217  case '\\': next_char = '\\'; break;
218  case '\'': next_char = '\''; break;
219  case '\"': next_char = '\"'; break;
220  }
221  }
222  prev_char = next_char; // prev_char is now contents.
223  next_char = is.get();
224  if (next_char != '\'') return Token(Token::ERROR);
225  next_char = is.get();
226  return Token(Token::CHAR_LIT, std::string(1,prev_char));
227  case '(':
228  case ')':
229  case ',':
230  case '{':
231  case '}':
232  case '[':
233  case ']':
234  case '.':
235  case ':':
236  case '?':
237  return Token(prev_char);
238  default:
239  // std::cout << "[[ Unk_char=" << int(unk_char) << " ]]" << std::endl;
240  return Token(Token::UNKNOWN, std::string(1, prev_char));
241  }
242  }
243 
244  return Token(Token::NONE);
245  }
246  };
247 
248 }
249 
250 #endif
~ConfigLexer()
Definition: ConfigLexer.h:62
int GetMaxToken() const
Definition: ConfigLexer.h:118
Unknown modifier; will trigger error.
bool is_idchar(char test_char)
Determine if a character is a letter, digit, or underscore.
Definition: string_utils.h:195
Information about a token instance from an input stream.
Definition: Lexer.h:48
ID
Definition: struct.h:26
emp::Token GetToken()
Definition: ConfigLexer.h:121
void AddDefaultPatterns()
Definition: ConfigLexer.h:76
bool is_digit(char test_char)
Determine if a character is a digit.
Definition: string_utils.h:185
If we are in emscripten, make sure to include the header.
Definition: array.h:37
Definition: ConfigLexer.h:24
int AddPattern(const std::string &name, const std::string &pattern, int id=0, bool ignore=false)
Definition: ConfigLexer.h:64
ConfigLexer(std::istream &in_stream)
Definition: ConfigLexer.h:43
void LibraryWarning(Ts &&...msg)
Library user has made an error in how they are using the library.
Definition: errors.h:149