Class: AppQuery::Tokenizer
- Inherits: Object
- Defined in: lib/app_query/tokenizer.rb
Defined Under Namespace
Classes: LexError
Instance Attribute Summary
- #input ⇒ Object (readonly): Returns the value of attribute input.
- #pos ⇒ Object (readonly): Returns the value of attribute pos.
- #start ⇒ Object (readonly): Returns the value of attribute start.
- #tokens ⇒ Object (readonly): Returns the value of attribute tokens.
Class Method Summary
- .tokenize ⇒ Object
Instance Method Summary
- #chars_read ⇒ Object
- #emit_token(t, v: nil) ⇒ Object
- #eos? ⇒ Boolean
- #err(msg) ⇒ Object
- #initialize(input, state: nil, start: nil, pos: nil) ⇒ Tokenizer (constructor): A new instance of Tokenizer.
- #last_emitted(ignore:) ⇒ Object
- #last_emitted?(ignore_whitespace: true, ignore: [], **kws) ⇒ Boolean
- #lex_append_cte ⇒ Object
- #lex_comment ⇒ Object
- #lex_cte ⇒ Object
- #lex_cte_columns ⇒ Object
- #lex_cte_identifier ⇒ Object
- #lex_cte_select ⇒ Object
- #lex_maybe_materialized ⇒ Object
- #lex_prepend_cte ⇒ Object
- #lex_recursive_cte ⇒ Object
- #lex_select ⇒ Object: There should always be a SELECT.
- #lex_sql ⇒ Object
- #lex_whitespace ⇒ Object: Optional.
- #lex_with ⇒ Object
- #match?(re) ⇒ Boolean
- #match_comment? ⇒ Boolean
- #push_return(*steps) ⇒ Object
- #read_char(n = 1) ⇒ Object
- #read_until(pattern) ⇒ Object
- #rest ⇒ Object
- #run(pos: nil) ⇒ Object
- #step ⇒ Object
Constructor Details
#initialize(input, state: nil, start: nil, pos: nil) ⇒ Tokenizer
Returns a new instance of Tokenizer.
# File 'lib/app_query/tokenizer.rb', line 13

def initialize(input, state: nil, start: nil, pos: nil)
  @input = input
  @tokens = []
  @start = start || 0
  @pos = pos || @start
  @return = Array(state || :lex_sql)
end
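An illustrative sketch (not from the gem's docs) of driving a fresh instance one state at a time:

t = AppQuery::Tokenizer.new("select 1")
t.step        # runs :lex_sql, which queues :lex_select
t.step        # runs :lex_select and emits the final token
t.tokens.last # => {v: "select 1", t: "SELECT", start: 0, end: 8}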
Instance Attribute Details
#input ⇒ Object (readonly)
Returns the value of attribute input.
# File 'lib/app_query/tokenizer.rb', line 7

def input
  @input
end
#pos ⇒ Object (readonly)
Returns the value of attribute pos.
# File 'lib/app_query/tokenizer.rb', line 7

def pos
  @pos
end
#start ⇒ Object (readonly)
Returns the value of attribute start.
# File 'lib/app_query/tokenizer.rb', line 7

def start
  @start
end
#tokens ⇒ Object (readonly)
Returns the value of attribute tokens.
# File 'lib/app_query/tokenizer.rb', line 7

def tokens
  @tokens
end
Class Method Details
.tokenize ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 9

def self.tokenize(...)
  new(...).run
end
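A minimal usage sketch; the require path is an assumption based on the file layout shown above:

require "app_query/tokenizer" # assumed entry point

AppQuery::Tokenizer.tokenize("select 1")
# => [{v: "select 1", t: "SELECT", start: 0, end: 8}]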
Instance Method Details
#chars_read ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 36

def chars_read
  input[start...pos]
end
#emit_token(t, v: nil) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 53

def emit_token(t, v: nil)
  @tokens << {v: v || chars_read, t: t, start: start, end: pos}
  @start = @pos
  self
end
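Emitting turns everything read since the last token into a new token and catches start up to pos, as this sketch illustrates:

t = AppQuery::Tokenizer.new("WITH")
t.read_char(4).emit_token("WITH")
t.tokens      # => [{v: "WITH", t: "WITH", start: 0, end: 4}]
t.chars_read  # => "" (start has caught up with pos)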
#eos? ⇒ Boolean
# File 'lib/app_query/tokenizer.rb', line 32

def eos?
  pos == input.size
end
#err(msg) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 21

def err(msg)
  linepos = linepos_by_pos[pos] || linepos_by_pos[pos.pred]
  msg += <<~ERR

    #{input}
    #{" " * linepos}^
  ERR

  raise LexError, msg
end
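For example, a CTE identifier followed by neither 'AS' nor a column list raises LexError; the caret placement below is approximate, since linepos_by_pos is not shown here:

AppQuery::Tokenizer.tokenize("with foo bar (select 1) select 1")
# raises AppQuery::Tokenizer::LexError:
#   Expected 'AS' or CTE columns following CTE-identifier, e.g. 'foo AS' 'foo()'
#
#   with foo bar (select 1) select 1
#            ^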
#last_emitted(ignore:) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 123

def last_emitted(ignore:)
  if ignore.none?
    @tokens.last
  else
    t = @tokens.dup
    while (result = t.pop)
      break if !ignore.include?(result[:t])
    end
    result
  end
end
#last_emitted?(ignore_whitespace: true, ignore: [], **kws) ⇒ Boolean
# File 'lib/app_query/tokenizer.rb', line 135

def last_emitted?(ignore_whitespace: true, ignore: [], **kws)
  ignore = if ignore.any?
    ignore
  elsif ignore_whitespace
    %w[COMMENT WHITESPACE]
  else
    []
  end
  last_emitted(ignore:)&.slice(*kws.keys) == kws
end
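A sketch of the skipping behavior; whitespace and comment tokens are ignored unless ignore_whitespace: false is passed:

t = AppQuery::Tokenizer.new("with  ")
t.read_char(4).emit_token("WITH")
t.read_char(2).emit_token("WHITESPACE")
t.last_emitted?(t: "WITH")                            # => true (WHITESPACE skipped)
t.last_emitted?(t: "WITH", ignore_whitespace: false)  # => false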
#lex_append_cte ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 105

def lex_append_cte
  emit_token "COMMA", v: ","
  emit_token "WHITESPACE", v: "\n "
  push_return :lex_recursive_cte
end
#lex_comment ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 305

def lex_comment
  err "Expected comment, i.e. '--' or '/*'" unless match_comment?

  if match?("--")
    read_until(/\n/)
  else
    read_until %r{\*/}
    err "Expected comment close '*/'." if eos?
    read_char 2
  end

  emit_token "COMMENT"
  push_return :lex_whitespace
end
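For instance, a leading line comment becomes a COMMENT token, followed by its trailing whitespace:

AppQuery::Tokenizer.tokenize("-- hi\nselect 1").map { _1[:t] }
# => ["COMMENT", "WHITESPACE", "SELECT"]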
#lex_cte ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 146

def lex_cte
  if match_comment?
    push_return :lex_cte, :lex_comment
  elsif last_emitted? t: "CTE_IDENTIFIER", ignore_whitespace: true
    if match?(/AS(\s|\()/i)
      read_char 2
      emit_token "AS"
      push_return :lex_cte, :lex_cte_select, :lex_maybe_materialized, :lex_whitespace
    elsif match?(%r{\(}) # "foo " "(id)"
      push_return :lex_cte, :lex_cte_columns
    else
      err "Expected 'AS' or CTE columns following CTE-identifier, e.g. 'foo AS' 'foo()'"
    end
  elsif last_emitted? t: "CTE_COLUMNS_CLOSE", ignore_whitespace: true
    if match?(/AS(\s|\()/i)
      read_char 2
      emit_token "AS"
      push_return :lex_cte, :lex_cte_select, :lex_maybe_materialized, :lex_whitespace
    else
      err "Expected 'AS' following CTE-columns"
    end
  elsif last_emitted? t: "CTE_SELECT", ignore_whitespace: true
    if match?(/,/) # but wait, there's more!
      read_char
      emit_token "CTE_COMMA"
      push_return :lex_cte, :lex_whitespace
    end
  else
    push_return :lex_cte, :lex_cte_identifier
  end
end
#lex_cte_columns ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 204

def lex_cte_columns
  err "Expected CTE columns, e.g. '(id, other)'" unless match? %r{\(}
  read_char
  read_until(/\S/)
  emit_token "CTE_COLUMNS_OPEN"

  loop do
    if match?(/\)/)
      err "Expected a column name" unless last_emitted? t: "CTE_COLUMN"
      read_char
      emit_token "CTE_COLUMNS_CLOSE"
      break
    elsif match?(/,/) # "( " ","
      err "Expected a column name" unless last_emitted? t: "CTE_COLUMN"
      read_char # ','
      read_until(/\S/)
      emit_token "CTE_COLUMN_DIV"
    elsif match?(/"/)
      unless last_emitted? t: "CTE_COLUMNS_OPEN"
        err "Expected comma" unless last_emitted? t: "CTE_COLUMN_DIV"
      end
      read_char
      read_until(/"/)
      read_char
      emit_token "CTE_COLUMN"
    elsif match?(/[_A-Za-z]/)
      unless last_emitted? t: "CTE_COLUMNS_OPEN"
        err "Expected comma" unless last_emitted? t: "CTE_COLUMN_DIV"
      end
      read_until %r{,|\s|\)}
      emit_token "CTE_COLUMN"
    elsif match?(/\s/)
      read_until(/\S/)
    else # e.g. "(id," "1)" or eos?
      err "Expected valid column name"
    end
  end
  push_return :lex_whitespace
end
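An illustrative run showing the tokens a column list produces:

AppQuery::Tokenizer.tokenize("with foo(id, n) as (select 1) select * from foo")
  .map { _1[:t] }
  .grep(/CTE_COLUMN/)
# => ["CTE_COLUMNS_OPEN", "CTE_COLUMN", "CTE_COLUMN_DIV", "CTE_COLUMN", "CTE_COLUMNS_CLOSE"]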
#lex_cte_identifier ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 279

def lex_cte_identifier
  err "Expected CTE identifier, e.g. 'foo', '\"foo bar\"' " unless match? %r{[_"A-Za-z]}

  if match?(/"/)
    read_char
    read_until(/"/)
    read_char
  else
    read_until %r{\s|\(}
  end
  emit_token "CTE_IDENTIFIER"
  push_return :lex_whitespace
end
#lex_cte_select ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 254

def lex_cte_select
  err "Expected CTE select, e.g. '(select 1)'" unless match? %r{\(}
  read_char
  level = 1

  loop do
    read_until(/\)|\(/)

    if eos?
      err "CTE select ended prematurely"
    elsif match?(/\(/)
      level += 1
    elsif match?(/\)/)
      level -= 1
      break if level.zero?
    end
    read_char
  end
  err "Expected non-empty CTE select, e.g. '(select 1)'" if chars_read.strip == "("
  read_char
  emit_token "CTE_SELECT"
  push_return :lex_whitespace
end
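The level counter tracks nested parentheses, so parenthesized expressions inside the select stay within a single CTE_SELECT token. A sketch, entering this state directly:

t = AppQuery::Tokenizer.new("(select max(id) from users)", state: :lex_cte_select)
t.run.first[:v]  # => "(select max(id) from users)"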
#lex_maybe_materialized ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 182

def lex_maybe_materialized
  if match?(/materialized/i)
    read_until(/\(/)
    emit_token "MATERIALIZED"
  elsif match?(%r{\(})
    # done
  elsif match?(/not\s/i)
    read_char 3
    read_until(/\S/)
    emit_token "NOT_MATERIALIZED"

    err "Expected 'MATERIALIZED'" unless match?(/materialized/i)
    push_return :lex_maybe_materialized
  else
    err "Expected CTE select or NOT? MATERIALIZED"
  end
end
#lex_prepend_cte ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 94

def lex_prepend_cte
  if eos?
    emit_token "COMMA", v: ","
    emit_token "WHITESPACE", v: "\n"
  elsif match?(/\s/)
    push_return :lex_prepend_cte, :lex_whitespace
  else
    push_return :lex_prepend_cte, :lex_recursive_cte
  end
end
#lex_recursive_cte ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 111

def lex_recursive_cte
  if match?(/recursive\s/i)
    read_until(/\s/)
    # make trailing whitespace part of next token
    # this makes adding cte's easier
    read_until(/\S/)
    emit_token "RECURSIVE"
  end

  push_return :lex_cte
end
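For example:

AppQuery::Tokenizer.tokenize("with recursive nums as (select 1) select * from nums")
  .map { _1[:t] }
  .first(3)
# => ["WITH", "RECURSIVE", "CTE_IDENTIFIER"]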
#lex_select ⇒ Object
There should always be a SELECT.
# File 'lib/app_query/tokenizer.rb', line 295

def lex_select
  read_until(/\Z/)
  read_char

  if last_emitted? t: "COMMENT", ignore_whitespace: false
    emit_token "WHITESPACE", v: "\n"
  end
  emit_token "SELECT"
end
#lex_sql ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 71

def lex_sql
  if last_emitted? t: "CTE_SELECT", ignore: %w[WHITESPACE COMMENT]
    push_return :lex_select
  elsif match?(/\s/)
    push_return :lex_sql, :lex_whitespace
  elsif match_comment?
    push_return :lex_sql, :lex_comment
  elsif match?(/with/i)
    push_return :lex_sql, :lex_with
  else
    push_return :lex_select
  end
end
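lex_sql is the initial state (see #initialize); an illustrative full pass over a simple CTE query:

AppQuery::Tokenizer.tokenize("with foo as (select 1) select * from foo")
  .map { _1.values_at(:t, :v) }
# => [["WITH", "with "], ["CTE_IDENTIFIER", "foo"], ["WHITESPACE", " "],
#     ["AS", "as"], ["WHITESPACE", " "], ["CTE_SELECT", "(select 1)"],
#     ["WHITESPACE", " "], ["SELECT", "select * from foo"]]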
#lex_whitespace ⇒ Object
Optional; a WHITESPACE token is emitted only when whitespace is present.
# File 'lib/app_query/tokenizer.rb', line 321

def lex_whitespace
  if match?(/\s/)
    read_until(/\S/)
    emit_token "WHITESPACE"
  end
end
#lex_with ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 85

def lex_with
  err "Expected 'WITH'" unless match? %r{WITH\s}i

  read_until(/\s/)
  read_until(/\S/)
  emit_token "WITH"
  push_return :lex_recursive_cte
end
#match?(re) ⇒ Boolean
# File 'lib/app_query/tokenizer.rb', line 49

def match?(re)
  rest[Regexp.new("\\A%s" % re)]
end
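The pattern is anchored at the current position, and the match (or nil) is returned rather than true/false. A sketch:

t = AppQuery::Tokenizer.new("WITH foo")
t.match?(/with\s/i)  # => "WITH " (truthy: matches at pos)
t.match?(/foo/)      # => nil (present, but not at pos)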
#match_comment? ⇒ Boolean
# File 'lib/app_query/tokenizer.rb', line 200

def match_comment?
  match?(%r{--|/\*})
end
#push_return(*steps) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 59

def push_return(*steps)
  (@return ||= []).push(*steps)
  self
end
#read_char(n = 1) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 40

def read_char(n = 1)
  @pos = [pos + n, input.size].min
  self
end
#read_until(pattern) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 64

def read_until(pattern)
  loop do
    break if match?(pattern) || eos?
    read_char
  end
end
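read_until stops just before the pattern (or at end of input), leaving the match for the next step. A sketch:

t = AppQuery::Tokenizer.new("select 1 -- done")
t.read_until(/--/)
t.chars_read  # => "select 1 "
t.rest        # => "-- done"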
#rest ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 45

def rest
  input[pos...]
end
#run(pos: nil) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 329

def run(pos: nil)
  loop do
    break if step.nil?
  end
  eos? ? tokens : self
end
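When the input is fully consumed, run returns the token array; otherwise it returns self, which can be useful for inspecting a partial run. Illustrative:

t = AppQuery::Tokenizer.new("  rest", state: :lex_whitespace)
t.run     # => t itself: the state queue is empty but input remains
t.tokens  # => [{v: "  ", t: "WHITESPACE", start: 0, end: 2}]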
#step ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 336

def step
  if (state = @return.pop)
    method(state).call
    self
  end
end