Empirical
RegEx.h
Go to the documentation of this file.
1 
35 #ifndef EMP_REGEX_H
36 #define EMP_REGEX_H
37 
38 
39 #include <ostream>
40 #include <sstream>
41 #include <string>
42 
43 #include "../base/vector.h"
44 #include "../base/Ptr.h"
45 
46 #include "BitSet.h"
47 #include "lexer_utils.h"
48 #include "NFA.h"
49 #include "string_utils.h"
50 
51 
52 namespace emp {
53 
55  class RegEx {
56  private:
57  constexpr static size_t NUM_SYMBOLS = 128;
59  std::string regex;
61  bool valid;
62  size_t pos;
63 
64  mutable DFA dfa;
65  mutable bool dfa_ready;
66 
67  template <typename... T>
68  void Error(T &&... args) {
69  notes.push_back(emp::to_string(std::forward<T>(args)...));
70  valid = false;
71  }
72 
73  // Pre-declarations
74  struct re_block;
75  struct re_charset;
76  struct re_parent;
77  struct re_string;
78 
80  struct re_base { // Also used for empty regex
81  virtual ~re_base() { ; }
82  virtual void Print(std::ostream & os) const { os << "[]"; }
83  virtual Ptr<re_block> AsBlock() { return nullptr; }
84  virtual Ptr<re_charset> AsCharSet() { return nullptr; }
85  virtual Ptr<re_parent> AsParent() { return nullptr; }
86  virtual Ptr<re_string> AsString() { return nullptr; }
87  virtual size_t GetSize() const { return 0; }
88  virtual bool Simplify() { return false; }
89  virtual void AddToNFA(NFA & nfa, size_t start, size_t stop) const { nfa.AddFreeTransition(start, stop); }
90  };
91 
93  struct re_string : public re_base { // Series of specific chars
94  std::string str;
95  re_string() : str() { ; }
96  re_string(char c) : str() { str.push_back(c); }
97  re_string(const std::string & s) : str(s) { ; }
98  void Print(std::ostream & os) const override { os << "STR[" << to_escaped_string(str) << "]"; }
99  Ptr<re_string> AsString() override { return ToPtr(this); }
100  size_t GetSize() const override { return str.size(); }
101  virtual void AddToNFA(NFA & nfa, size_t start, size_t stop) const override {
102  size_t prev_id = start;
103  for (char x : str) {
104  size_t next_id = nfa.AddNewState();
105  nfa.AddTransition(prev_id, next_id, (size_t) x);
106  prev_id = next_id;
107  }
108  nfa.AddFreeTransition(prev_id, stop);
109  }
110  };
111 
113  struct re_charset : public re_base { // Any char from set.
114  opts_t char_set;
115  re_charset() : char_set() { ; }
116  re_charset(char x, bool neg=false) : char_set() {
117  char_set[(size_t)x]=true;
118  if (neg) char_set.NOT_SELF();
119  }
120  re_charset(const std::string & s, bool neg=false) : char_set() {
121  for (char x : s) char_set[(size_t)x]=true;
122  if (neg) char_set.NOT_SELF();
123  }
124  void Print(std::ostream & os) const override {
125  auto chars = char_set.GetOnes();
126  bool use_not = false;
127  if (chars.size() > 64) { chars = (~char_set).GetOnes(); use_not = true; }
128  os << "SET[";
129  if (use_not) os << "NOT ";
130  for (auto c : chars) os << to_escaped_string((char) c);
131  os << "]";
132  }
133  Ptr<re_charset> AsCharSet() override { return ToPtr(this); }
134  size_t GetSize() const override { return char_set.CountOnes(); }
135  char First() const { return (char) char_set.FindBit(); }
136  virtual void AddToNFA(NFA & nfa, size_t start, size_t stop) const override {
137  for (size_t i = 0; i < NUM_SYMBOLS; i++) if (char_set[i]) nfa.AddTransition(start, stop, i);
138  }
139  };
140 
142  struct re_parent : public re_base {
143  protected:
145  public:
146  re_parent() : nodes() { }
147  ~re_parent() { for (auto x : nodes) x.Delete(); }
148  void Clear() { for (auto x : nodes) x.Delete(); nodes.resize(0); }
149  virtual void push(Ptr<re_base> x) { emp_assert(x != nullptr); nodes.push_back(x); }
150  Ptr<re_base> pop() { auto out = nodes.back(); nodes.pop_back(); return out; }
151  size_t GetSize() const override { return nodes.size(); }
152  Ptr<re_parent> AsParent() override { return ToPtr(this); }
153  bool Simplify() override {
154  bool m=false;
155  for (auto & x : nodes) {
156  // Recursively simplify children.
157  m |= x->Simplify();
158  // A block with one child can be replaced by child.
159  if (x->AsBlock() && x->GetSize() == 1) {
160  auto child = x->AsParent()->nodes[0];
161  x->AsParent()->nodes.resize(0);
162  x.Delete();
163  x = child;
164  m = true;
165  }
166  }
167  return m;
168  }
169  };
170 
172  struct re_block : public re_parent { // Series of re's
173  void Print(std::ostream & os) const override {
174  os << "BLOCK["; for (auto x : nodes) x->Print(os); os << "]";
175  }
176  Ptr<re_block> AsBlock() override { return ToPtr(this); }
177  bool Simplify() override {
178  bool modify = false;
179  // Loop through block elements, simplifying when possible.
180  for (size_t i = 0; i < nodes.size(); i++) {
181  // If node is a charset with one option, replace it with a string.
182  if (nodes[i]->AsCharSet() && nodes[i]->GetSize() == 1) {
183  auto new_node = NewPtr<re_string>(nodes[i]->AsCharSet()->First());
184  nodes[i].Delete();
185  nodes[i] = new_node;
186  modify = true;
187  }
188  // If two neighboring nodes are strings, merge them.
189  if (i > 0 && nodes[i]->AsString() && nodes[i-1]->AsString()) {
190  nodes[i-1]->AsString()->str += nodes[i]->AsString()->str;
191  nodes[i].Delete();
192  nodes.erase(nodes.begin() + (int) i);
193  i--;
194  modify = true;
195  continue;
196  }
197 
198  // If blocks are nested, merge them into a single block.
199  if (nodes[i]->AsBlock()) {
200  auto old_node = nodes[i]->AsBlock(); // Save the old node for merging.
201  nodes.erase(nodes.begin() + (int) i); // Remove block from nodes.
202  nodes.insert(nodes.begin() + (int) i, old_node->nodes.begin(), old_node->nodes.end());
203  old_node->nodes.resize(0); // Don't recurse delete since nodes were moved!
204  old_node.Delete();
205  i--;
206  modify = true;
207  continue;
208  }
209  }
210 
211  // Also run the default Simplify on nodes.
212  modify |= re_parent::Simplify();
213 
214  return modify;
215  }
216  virtual void AddToNFA(NFA & nfa, size_t start, size_t stop) const override {
217  size_t prev_id = start;
218  for (auto x : nodes) {
219  size_t next_id = nfa.AddNewState();
220  x->AddToNFA(nfa, prev_id, next_id);
221  prev_id = next_id;
222  }
223  nfa.AddFreeTransition(prev_id, stop);
224  }
225  };
226 
228  struct re_or : public re_parent { // lhs -or- rhs
229  re_or(Ptr<re_base> l, Ptr<re_base> r) { push(l); push(r); }
230  void Print(std::ostream & os) const override {
231  os << "|[";
232  nodes[0]->Print(os);
233  nodes[1]->Print(os);
234  os << "]";
235  }
236  virtual void AddToNFA(NFA & nfa, size_t start, size_t stop) const override {
237  nodes[0]->AddToNFA(nfa, start, stop);
238  nodes[1]->AddToNFA(nfa, start, stop);
239  }
240  };
241 
243  struct re_star : public re_parent { // zero-or-more
244  re_star(Ptr<re_base> c) { push(c); }
245  void Print(std::ostream & os) const override { os << "*["; nodes[0]->Print(os); os << "]"; }
246 
247  virtual void AddToNFA(NFA & nfa, size_t start, size_t stop) const override {
248  const size_t target = nfa.AddNewState();
249  nodes[0]->AddToNFA(nfa, start, target);
250  nfa.AddFreeTransition(target, start);
251  nfa.AddFreeTransition(start, stop);
252  }
253  };
254 
256  struct re_plus : public re_parent { // one-or-more
257  re_plus(Ptr<re_base> c) { push(c); }
258  void Print(std::ostream & os) const override { os << "+["; nodes[0]->Print(os); os << "]"; }
259  virtual void AddToNFA(NFA & nfa, size_t start, size_t stop) const override {
260  const size_t target = nfa.AddNewState();
261  nodes[0]->AddToNFA(nfa, start, target);
262  // From the target, can either go back to start and repeat, or straight to stop.
263  nfa.AddFreeTransition(target, start);
264  nfa.AddFreeTransition(target, stop);
265  }
266  };
267 
269  struct re_qm : public re_parent { // zero-or-one
270  re_qm(Ptr<re_base> c) { push(c); }
271  void Print(std::ostream & os) const override { os << "?["; nodes[0]->Print(os); os << "]"; }
272  virtual void AddToNFA(NFA & nfa, size_t start, size_t stop) const override {
273  nodes[0]->AddToNFA(nfa, start, stop);
274  nfa.AddFreeTransition(start, stop);
275  }
276  };
277 
278  re_block head;
279 
282  bool EnsureNext(char x) {
283  if (pos >= regex.size()) Error("Expected ", x, " before end.");
284  else if (regex[pos] != x) Error("Expected ", x, " at position ", pos,
285  "; found ", regex[pos], ".");
286  ++pos; // We have what we were expecting! Move on...
287  return valid;
288  }
289 
291  Ptr<re_charset> ConstructSet() {
292  char c = regex[pos++];
293  bool neg = false;
294  if (c == '^') { neg = true; c = regex[pos++]; }
295  auto out = NewPtr<re_charset>();
296  char prev_c = -1;
297  while (c != ']' && pos < regex.size()) {
298  if (c == '-' && prev_c != -1) {
299  c = regex[pos++];
300  if (c < prev_c) { Error("Invalid character range ", prev_c, '-', c); continue; }
301  for (char x = prev_c; x <= c; x++) {
302  out->char_set[(size_t)x] = true;
303  }
304  prev_c = -1;
305  c = regex[pos++];
306  continue;
307  }
308  else if (c == '\\') {
309  c = regex[pos++]; // Identify the specific escape char.
310  switch(c) {
311  case 'n': c = '\n'; break;
312  case 'r': c = '\r'; break;
313  case 't': c = '\t'; break;
314  // Any of these characters should just be themselves!
315  case '-':
316  case '\\':
317  case ']':
318  case '[':
319  case '^':
320  break;
321  default:
322  Error("Unknown escape char for char sets: '\\", c, "'.");
323  }
324  }
325  out->char_set[(size_t)c] = true;
326  prev_c = c;
327  c = regex[pos++];
328  }
329  if (neg) out->char_set.NOT_SELF();
330  if (c == ']') --pos;
331  return out;
332  }
333 
335  Ptr<re_string> ConstructString() {
336  char c = regex[pos++];
337  auto out = NewPtr<re_string>();
338  while (c != '\"' && pos < regex.size()) {
339  // @CAO Error if we run out of chars before close '"'
340  if (c == '\\') {
341  c = regex[pos++]; // Identify the specific escape char.
342  switch(c) {
343  case 'n': c = '\n'; break;
344  case 'r': c = '\r'; break;
345  case 't': c = '\t'; break;
346  // Any of these characters should just be themselves!
347  case '\"':
348  case '\\':
349  break;
350  default:
351  Error("Unknown escape char for literal string: '\\", c, "'.");
352  }
353  }
354  out->str.push_back(c);
355  c = regex[pos++];
356  }
357  if (c == '\"') --pos;
358 
359  return out;
360  }
361 
363  Ptr<re_base> ConstructSegment() {
364  Ptr<re_base> result;
365  char c = regex[pos++]; // Grab the current character and move pos to next.
366  switch (c) {
367  case '.':
368  result = NewPtr<re_charset>('\n', true); // Anything except newline.
369  break;
370  case '(':
371  result = Process(); // Process the internal contents of parens.
372  EnsureNext(')'); // Make sure last char is a paren and advance.
373  break;
374  case '[':
375  result = ConstructSet(); // Build the inside of the set.
376  EnsureNext(']'); // Make sure last char is a close-bracket and advance.
377  break;
378  case '"':
379  result = ConstructString(); // Build the inside of the string.
380  EnsureNext('"'); // Make sure last char is a quote and advance.
381  break;
382  case '\\':
383  c = regex[pos++]; // Identify the specific escape char.
384  switch(c) {
385  case 'n': c = '\n'; break;
386  case 'r': c = '\r'; break;
387  case 't': c = '\t'; break;
388  // Any of these characters should just be themselves!
389  case '\\':
390  case '\"':
391  case '*':
392  case '+':
393  case '?':
394  case '.':
395  case '|':
396  case '(':
397  case ')':
398  case '[':
399  case ']':
400  break;
401  default:
402  Error("Unknown escape char for regex: '\\", c, "'.");
403  }
404  result = NewPtr<re_string>(c);
405  break;
406 
407  // Error cases
408  case '|':
409  case '*':
410  case '+':
411  case '?':
412  case ')':
413  Error("Expected regex segment but got '", c, "' at position ", pos, ".");
414  result = NewPtr<re_string>(c);
415  break;
416 
417  default:
418  // Take this char directly.
419  result = NewPtr<re_string>(c);
420  }
421 
422  emp_assert(result != nullptr);
423  return result;
424  }
425 
427  Ptr<re_block> Process(Ptr<re_block> cur_block=nullptr) {
428  emp_assert(pos >= 0 && pos < regex.size(), pos, regex.size());
429 
430  // If caller does not provide current block, create one (and return it.)
431  if (cur_block==nullptr) cur_block = NewPtr<re_block>();
432 
433  // All blocks need to start with a single token.
434  cur_block->push( ConstructSegment() );
435 
436  while (pos < regex.size()) {
437  const char c = regex[pos++];
438  switch (c) {
439  // case '|': cur_block->push( new re_or( cur_block->pop(), ConstructSegment() ) ); break;
440  case '|': cur_block->push( NewPtr<re_or>( cur_block->pop(), Process() ) ); break;
441  case '*': cur_block->push( NewPtr<re_star>( cur_block->pop() ) ); break;
442  case '+': cur_block->push( NewPtr<re_plus>( cur_block->pop() ) ); break;
443  case '?': cur_block->push( NewPtr<re_qm>( cur_block->pop() ) ); break;
444  case ')': pos--; return cur_block; // Must be ending segment (restore pos to check on return)
445 
446  default: // Must be a regular "segment"
447  pos--; // Restore to previous char to construct the next seqment.
448  cur_block->push( ConstructSegment() );
449  }
450  }
451 
452  return cur_block;
453  }
454 
455  public:
456  RegEx() = delete;
457  RegEx(const std::string & r)
458  : regex(r), notes(), valid(true), pos(0), dfa(), dfa_ready(false), head() {
459  Process(ToPtr(&head));
460  while(head.Simplify());
461  }
462  RegEx(const RegEx & r)
463  : regex(r.regex), notes(), valid(true), pos(0), dfa(), dfa_ready(false), head() {
464  Process(ToPtr(&head));
465  while(head.Simplify());
466  }
467  ~RegEx() { ; }
468 
470  RegEx & operator=(const RegEx & r) {
471  regex = r.regex;
472  notes.resize(0);
473  valid = true;
474  pos = 0;
475  head.Clear();
476  Process(ToPtr(&head));
477  while (head.Simplify());
478  return *this;
479  }
480 
482  std::string AsString() const { return to_literal(regex); }
483 
485  void AddToNFA(NFA & nfa, size_t start, size_t stop) const { head.AddToNFA(nfa, start, stop); }
486 
488  void Generate() const;
489 
491  bool Test(const std::string & str) const {
492  if (!dfa_ready) Generate();
493  return dfa.Test(str);
494  }
495 
497  void PrintInternal() { head.Print(std::cout); std::cout << std::endl; }
498 
500  void PrintNotes() {
501  for (const std::string & n : notes) {
502  std::cout << n << std::endl;
503  }
504  }
505 
507  void PrintDebug() {
508  if (notes.size()) {
509  std::cout << "NOTES:" << std::endl;
510  PrintNotes();
511  }
512  std::cout << "RegEx: " << to_escaped_string(regex) << std::endl;
513  std::cout << "INTERNAL: ";
514  PrintInternal();
515  }
516  };
517 
518 
520  static NFA to_NFA(const RegEx & regex, size_t stop_id=1) {
521  NFA nfa(2); // State 0 = start, state 1 = stop.
522  nfa.SetStop(1, stop_id);
523  regex.AddToNFA(nfa, 0, 1);
524  return nfa;
525  }
526 
528  static DFA to_DFA(const RegEx & regex) {
529  return to_DFA( to_NFA(regex) );
530  }
531 
532  void RegEx::Generate() const {
533  dfa = to_DFA(*this);
534  dfa_ready = true;
535  }
536 }
537 
538 #endif
Ptr< T > ToPtr(T *_in, bool own=false)
Convert a T* to a Ptr<T>. By default, don&#39;t track.
Definition: Ptr.h:816
static const NFA & to_NFA(const NFA &nfa)
Converting NFA to MFA – no change needed.
Definition: lexer_utils.h:30
stop_t Test(const std::string &str) const
Determine if an entire series of symbols is valid.
Definition: DFA.h:103
RegEx(const std::string &r)
Definition: RegEx.h:457
RegEx()=delete
std::string to_string(ALL_TYPES &&...all_values)
Definition: string_utils.h:511
A basic regular expression handler.
Definition: RegEx.h:55
void Delete()
Definition: Ptr.h:737
RegEx(const RegEx &r)
Definition: RegEx.h:462
A dynamic NFA class, for easily building non-determanistic finite automata.
Definition: NFA.h:37
size_t CountOnes() const
Count the number of ones in the BitSet using bit tricks for a speedup.
Definition: BitSet.h:371
void AddFreeTransition(size_t from, size_t to)
Create a free transition between &#39;from&#39; and &#39;to&#39;.
Definition: NFA.h:152
void Generate() const
Assume the RegEx is ready and setup processing for it.
Definition: RegEx.h:532
void push_back(PB_Ts &&...args)
Definition: vector.h:189
std::string AsString() const
Convert the RegEx to an standard string, readable from outsite this class.
Definition: RegEx.h:482
void AddTransition(size_t from, size_t to, size_t sym)
Add a transition between states &#39;from&#39; and &#39;to&#39; that can be taken with the provided symbol...
Definition: NFA.h:131
Simple functions to manipulate strings.
size_t size() const
Definition: vector.h:151
static const DFA & to_DFA(const DFA &dfa)
Converting DFA to DFA – no change needed.
Definition: lexer_utils.h:27
emp::vector< size_t > GetOnes() const
Return a vector indicating the posistions of all ones in the BitSet.
Definition: BitSet.h:402
static const PrintStr endl("<br>")
Pre-define emp::endl to insert a "<br>" and thus acting like a newline.
void pop_back()
Definition: vector.h:194
void AddToNFA(NFA &nfa, size_t start, size_t stop) const
Add this regex to an NFA being built.
Definition: RegEx.h:485
std::string to_literal(const LIT_TYPE &value)
Take a value and convert it to a C++-style literal.
Definition: string_utils.h:99
size_t AddNewState()
Add a new state into the NFA and return its id.
Definition: NFA.h:128
A drop-in replacement for std::bitset, with additional bit magic features.
~RegEx()
Definition: RegEx.h:467
void resize(size_t new_size)
Definition: vector.h:161
int FindBit() const
Return the index of the first one in the sequence; return -1 if no ones are available.
Definition: BitSet.h:374
static std::string to_escaped_string(char value)
Convert a single chararcter to one that uses a proper escape sequence (in a string) if needed...
Definition: string_utils.h:36
void PrintInternal()
For debugging: print the internal representation of the regex.
Definition: RegEx.h:497
If we are in emscripten, make sure to include the header.
Definition: array.h:37
Definition: Ptr.h:711
void SetStop(size_t state, T stop_val=1)
Set the specified state to be a stop state (with an optional stop value.)
Definition: NFA.h:174
#define emp_assert(...)
Definition: assert.h:199
T & back()
Definition: vector.h:183
RegEx & operator=(const RegEx &r)
Set this RegEx equal to another.
Definition: RegEx.h:470
void PrintDebug()
Print general debuging information about this regex.
Definition: RegEx.h:507
void PrintNotes()
For debugging: print any internal notes generated about this regex.
Definition: RegEx.h:500
bool Test(const std::string &str) const
Test if a string statisfies this regex.
Definition: RegEx.h:491
void Print(const emp::vector< T > &v, std::ostream &os=std::cout, const std::string &spacer=" ")
Print the contects of a vector.
Definition: vector_utils.h:35
A set of utilities to convert between NFAs and DFAs.
BitSet & NOT_SELF()
Perform a Boolean NOT on this BitSet, store result here, and return this object.
Definition: BitSet.h:467
constexpr size_t GetSize(T(&)[N])
Determine the size of a built-in array.
Definition: functions.h:81
A Non-deterministic Finite Automata simulator.