Empirical
Parser.h
Go to the documentation of this file.
1 
17 // OTHER NOTES:
18 // Setup -> and | and || operators on Parse symbol to all do the same thing: take a pattern of
19 // either string or int (or ideally mixed??) and add a new rule.
20 //
21 // parser("expression") -> { "literal_int" }
22 // | { "expression", "+", "expression"}
23 // | { "expression", "*", "expression"}
24 // | { "(", "expression", ")"}
25 
26 #ifndef EMP_PARSER_H
27 #define EMP_PARSER_H
28 
29 #include "../base/vector.h"
30 
31 #include "BitVector.h"
32 #include "Lexer.h"
33 
34 namespace emp {
35 
37  struct ParseSymbol {
38  std::string name;
40  size_t id;
41 
44  bool nullable;
45 
47  : name(), rule_ids(), id(0)
48  , first(Lexer::MaxTokenID()), follow(Lexer::MaxTokenID()), nullable(false) { ; }
49  };
50 
52  struct ParseRule {
53  size_t symbol_id;
55 
56  ParseRule(size_t sid) : symbol_id(sid), pattern() { ; }
57  };
58 
60  class Parser {
61  private:
62  Lexer & lexer;
63  emp::vector<ParseSymbol> symbols;
65  size_t cur_symbol_id;
66  int active_pos;
67 
68  void BuildRule(emp::vector<size_t> & new_pattern) { ; }
69  template <typename T, typename... EXTRAS>
70  void BuildRule(emp::vector<size_t> & new_pattern, T && arg, EXTRAS &&... extras) {
71  new_pattern.push_back( GetID((size_t) std::forward<T>(arg)) );
72  BuildRule(new_pattern, std::forward<EXTRAS>(extras)...);
73  }
74 
76  int GetSymbolPos(const std::string & name) const {
77  for (size_t i = 0; i < symbols.size(); i++) {
78  if (symbols[i].name == name) return (int) i;
79  }
80  return -1;
81  }
82 
84  int GetIDPos(size_t id) const {
85  if (id < lexer.MaxTokenID()) return -1;
86  return (int) (id - lexer.MaxTokenID());
87  }
88 
90  size_t AddSymbol(const std::string & name) {
91  ParseSymbol new_symbol;
92  new_symbol.name = name;
93  new_symbol.id = cur_symbol_id++;
94  const size_t out_pos = symbols.size();
95  symbols.emplace_back(new_symbol);
96  return out_pos;
97  }
98 
99  public:
100  Parser(Lexer & in_lexer)
101  : lexer(in_lexer), symbols(), rules()
102  , cur_symbol_id(in_lexer.MaxTokenID()), active_pos(0) { ; }
103  ~Parser() { ; }
104 
105  Lexer & GetLexer() { return lexer; }
106 
108  size_t GetID(size_t id) const { return id; }
109 
111  size_t GetID(const std::string & name) {
112  int spos = GetSymbolPos(name); // First check if parse symbol exists.
113  if (spos >= 0) return symbols[(size_t)spos].id; // ...if so, return it.
114  size_t tid = lexer.GetTokenID(name); // Otherwise, check for token name.
115  if (Lexer::TokenOK(tid)) return tid; // ...if so, return id.
116 
117  // Else, add symbol to declaration list
118  size_t new_spos = AddSymbol(name);
119  return symbols[new_spos].id;
120  }
121 
123  std::string GetName(size_t symbol_id) const {
124  if (Lexer::TokenOK(symbol_id)) return lexer.GetTokenName(symbol_id);
125  const size_t spos = symbol_id - lexer.MaxTokenID();
126  return symbols[spos].name;
127  }
128 
130  Parser & operator()(const std::string & name) {
131  active_pos = GetSymbolPos(name);
132  if (active_pos == -1) active_pos = (int) AddSymbol(name);
133  return *this;
134  }
135 
137  ParseSymbol & GetParseSymbol(const std::string & name) {
138  size_t pos = (size_t) GetSymbolPos( name );
139  return symbols[pos];
140  }
141 
143  template <typename... STATES>
144  Parser & Rule(STATES... states) {
145  emp_assert(active_pos >= 0 && active_pos < (int) symbols.size(), active_pos);
146 
147  auto rule_id = rules.size();
148  symbols[(size_t)active_pos].rule_ids.push_back(rule_id);
149  rules.emplace_back(active_pos);
150  BuildRule(rules.back().pattern, states...);
151  if (rules.back().pattern.size() == 0) symbols[(size_t)active_pos].nullable = true;
152  return *this;
153  }
154 
156  template <typename... STATES>
157  size_t AddRule(const std::string & name, STATES &&... states) {
158  const size_t id = GetID(name);
159  active_pos = GetSymbolPos(name); // @CAO We just did this, so can be faster.
160  Rule(std::forward<STATES>(states)...);
161  return id;
162  }
163 
165  void Process(std::istream & is, bool test_valid=true) {
166  // Scan through the current grammar and try to spot any problems.
167  if (test_valid) {
168  // @CAO: Any symbols with no rules?
169  // @CAO: Any inaccessible symbols?
170  // @CAO: Ideally, any shift-reduce or reduce-reduce errors? (maybe later?)
171  }
172 
173  // Determine which symbols are nullable.
174  bool progress = true;
175  while (progress) {
176  progress = false;
177  // Scan all symbols.
178  for (auto & r : rules) {
179  auto & s = symbols[r.symbol_id];
180  if (s.nullable) continue; // If a symbol is already nullable, skip it.
181 
182  // For each pattern, see if all internal symbols are nullable.
183  bool cur_nullable = true;
184  for (size_t sid : r.pattern) {
185  int pos = GetIDPos(sid);
186  if (pos < 0 || symbols[(size_t)pos].nullable == false) { cur_nullable = false; break; }
187  }
188  if (cur_nullable) { s.nullable = true; progress = true; break; }
189  }
190  }
191 
192  // Determine FIRST of each symbol.
193  // @CAO Can speed up by ignoring a rule if it can't provide new information.
194  progress = true;
195  while (progress) {
196  progress = false;
197  // @CAO Continue here.
198  }
199  }
200 
202  void Print(std::ostream & os=std::cout) const {
203  os << symbols.size() << " parser symbols available." << std::endl;
204  for (const auto & s : symbols) {
205  os << "symbol '" << s.name << "' (id " << s.id << ") has "
206  << s.rule_ids.size() << " patterns.";
207  if (s.nullable) os << " [NULLABLE]";
208  os << std::endl;
209  for (size_t rid : s.rule_ids) {
210  const emp::vector<size_t> & p = rules[rid].pattern;
211  os << " ";
212  if (p.size() == 0) os << " [empty]";
213  for (size_t x : p) os << " " << GetName(x) << "(" << x << ")";
214  os << std::endl;
215  }
216  }
217  }
218 
219  };
220 
221 }
222 
223 #endif
void Process(std::istream &is, bool test_valid=true)
Convert an input stream into a parse tree (TO FINISH!)
Definition: Parser.h:165
size_t id
What is the unique ID of this symbol?
Definition: Parser.h:40
emp::vector< size_t > pattern
The pattern that this rule is triggered by.
Definition: Parser.h:54
std::string GetName(size_t symbol_id) const
Conversion ot a sybol ID to its name.
Definition: Parser.h:123
static bool TokenOK(size_t id)
Definition: Lexer.h:75
ParseSymbol()
Definition: Parser.h:46
A drop-in replacement for std::vector<bool>, but with extra bitwise logic features.
Definition: BitVector.h:39
A drop-in replacement for std::vector<bool>, with additional bitwise logic features.
void push_back(PB_Ts &&...args)
Definition: vector.h:189
A lexer with a set of token types (and associated regular expressions)
Definition: Lexer.h:64
A rule for how parsing should work.
Definition: ConfigParser.h:21
size_t GetID(const std::string &name)
Converstion of a symbol name to its ID.
Definition: Parser.h:111
ParseRule(size_t sid)
Definition: Parser.h:56
static constexpr size_t MaxTokenID()
How many total token types are allowed in this lexer?
Definition: Lexer.h:93
size_t size() const
Definition: vector.h:151
void emplace_back(ARGS &&...args)
Definition: vector.h:219
emp::BitVector first
What tokens can begin this symbol?
Definition: Parser.h:42
Parser & operator()(const std::string &name)
Provide a symbol to the compiler and set it as active.
Definition: Parser.h:130
static const PrintStr endl("<br>")
Pre-define emp::endl to insert a "<br>" and thus acting like a newline.
A general-purpose, fast lexer.
Parser(Lexer &in_lexer)
Definition: Parser.h:100
bool nullable
Can this symbol be converted to nothing?
Definition: Parser.h:44
size_t AddRule(const std::string &name, STATES &&...states)
Specify the name of the symbol and add a rule to it, returning the symbol id.
Definition: Parser.h:157
void Print(std::ostream &os=std::cout) const
Print the current status of this parser (for debugging)
Definition: Parser.h:202
A single symbol in a grammer including the patterns that generate it.
Definition: Parser.h:37
~Parser()
Definition: Parser.h:103
If we are in emscripten, make sure to include the header.
Definition: array.h:37
std::string name
Unique name for this parse symbol.
Definition: Parser.h:38
std::string GetTokenName(size_t id) const
Get the name associated with a token type (you provide the ID)
Definition: Lexer.h:104
Lexer & GetLexer()
Definition: Parser.h:105
emp::BitVector follow
What tokens can come after this symbol?
Definition: Parser.h:43
size_t symbol_id
The ID of the symbol that this rule should simplify to.
Definition: Parser.h:53
#define emp_assert(...)
Definition: assert.h:199
T & back()
Definition: vector.h:183
size_t GetID(size_t id) const
Trivial conversions of ID to ID...
Definition: Parser.h:108
ParseSymbol & GetParseSymbol(const std::string &name)
Get the parser symbol information associated with a provided name.
Definition: Parser.h:137
Full information about a parser, including a lexer, symbols, and rules.
Definition: Parser.h:60
Parser & Rule(STATES...states)
Use the currently active symbol and attach a rule to it.
Definition: Parser.h:144
size_t GetTokenID(const std::string &name) const
Get the ID associated with a token type (you provide the token name)
Definition: Lexer.h:96
emp::vector< size_t > rule_ids
Which rules apply to this symbol?
Definition: Parser.h:39