Class: AppQuery::Tokenizer
- Inherits: Object
- Defined in: lib/app_query/tokenizer.rb
Defined Under Namespace
Classes: LexError
Instance Attribute Summary
- #input ⇒ Object (readonly): Returns the value of attribute input.
- #pos ⇒ Object (readonly): Returns the value of attribute pos.
- #start ⇒ Object (readonly): Returns the value of attribute start.
- #tokens ⇒ Object (readonly): Returns the value of attribute tokens.
Class Method Summary
- .tokenize ⇒ Object
Instance Method Summary
- #chars_read ⇒ Object
- #emit_token(t, v: nil) ⇒ Object
- #eos? ⇒ Boolean
- #err(msg) ⇒ Object
- #initialize(input, state: nil, start: nil, pos: nil) ⇒ Tokenizer (constructor): A new instance of Tokenizer.
- #last_emitted(ignore:) ⇒ Object
- #last_emitted?(ignore_whitespace: true, ignore: [], **kws) ⇒ Boolean
- #lex_append_cte ⇒ Object
- #lex_comment ⇒ Object
- #lex_cte ⇒ Object
- #lex_cte_columns ⇒ Object
- #lex_cte_identifier ⇒ Object
- #lex_cte_select ⇒ Object
- #lex_maybe_materialized ⇒ Object
- #lex_prepend_cte ⇒ Object
- #lex_recursive_cte ⇒ Object
- #lex_select ⇒ Object: There should always be a SELECT.
- #lex_sql ⇒ Object
- #lex_whitespace ⇒ Object: Optional.
- #lex_with ⇒ Object
- #match?(re) ⇒ Boolean
- #match_comment? ⇒ Boolean
- #push_return(*steps) ⇒ Object
- #read_char(n = 1) ⇒ Object
- #read_until(pattern) ⇒ Object
- #rest ⇒ Object
- #run(pos: nil) ⇒ Object
- #step ⇒ Object
Constructor Details
#initialize(input, state: nil, start: nil, pos: nil) ⇒ Tokenizer
Returns a new instance of Tokenizer.
# File 'lib/app_query/tokenizer.rb', line 13

def initialize(input, state: nil, start: nil, pos: nil)
  @input = input
  @tokens = []
  @start = start || 0
  @pos = pos || @start
  @return = Array(state || :lex_sql)
end
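An illustrative sketch (not from the gem's docs) of driving a fresh instance one state at a time:

t = AppQuery::Tokenizer.new("select 1")
t.step        # runs :lex_sql, which queues :lex_select
t.step        # runs :lex_select and emits the final token
t.tokens.last # => {v: "select 1", t: "SELECT", start: 0, end: 8}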
Instance Attribute Details
#input ⇒ Object (readonly)
Returns the value of attribute input.
# File 'lib/app_query/tokenizer.rb', line 7

def input
  @input
end
#pos ⇒ Object (readonly)
Returns the value of attribute pos.
# File 'lib/app_query/tokenizer.rb', line 7

def pos
  @pos
end
#start ⇒ Object (readonly)
Returns the value of attribute start.
# File 'lib/app_query/tokenizer.rb', line 7

def start
  @start
end
#tokens ⇒ Object (readonly)
Returns the value of attribute tokens.
# File 'lib/app_query/tokenizer.rb', line 7

def tokens
  @tokens
end
Class Method Details
.tokenize ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 9

def self.tokenize(...)
  new(...).run
end
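A minimal usage sketch; the require path is an assumption based on the file layout shown above:

require "app_query/tokenizer" # assumed entry point

AppQuery::Tokenizer.tokenize("select 1")
# => [{v: "select 1", t: "SELECT", start: 0, end: 8}]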
Instance Method Details
#chars_read ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 36

def chars_read
  input[start...pos]
end
#emit_token(t, v: nil) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 53

def emit_token(t, v: nil)
  @tokens << {v: v || chars_read, t: t, start: start, end: pos}
  @start = @pos
  self
end
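Emitting turns everything read since the last token into a new token and catches start up to pos, as this sketch illustrates:

t = AppQuery::Tokenizer.new("WITH")
t.read_char(4).emit_token("WITH")
t.tokens      # => [{v: "WITH", t: "WITH", start: 0, end: 4}]
t.chars_read  # => "" (start has caught up with pos)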
#eos? ⇒ Boolean
# File 'lib/app_query/tokenizer.rb', line 32

def eos?
  pos == input.size
end
#err(msg) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 21

def err(msg)
  linepos = linepos_by_pos[pos] || linepos_by_pos[pos.pred]
  msg += <<~ERR

    #{input}
    #{" " * linepos}^
  ERR

  raise LexError, msg
end
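For example, a CTE identifier followed by neither 'AS' nor a column list raises LexError; the caret placement below is approximate, since linepos_by_pos is not shown here:

AppQuery::Tokenizer.tokenize("with foo bar (select 1) select 1")
# raises AppQuery::Tokenizer::LexError:
#   Expected 'AS' or CTE columns following CTE-identifier, e.g. 'foo AS' 'foo()'
#
#   with foo bar (select 1) select 1
#            ^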
#last_emitted(ignore:) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 123

def last_emitted(ignore:)
  if ignore.none?
    @tokens.last
  else
    t = @tokens.dup
    while (result = t.pop)
      break if !ignore.include?(result[:t])
    end
    result
  end
end
#last_emitted?(ignore_whitespace: true, ignore: [], **kws) ⇒ Boolean
# File 'lib/app_query/tokenizer.rb', line 135

def last_emitted?(ignore_whitespace: true, ignore: [], **kws)
  ignore = if ignore.any?
    ignore
  elsif ignore_whitespace
    %w[COMMENT WHITESPACE]
  else
    []
  end
  last_emitted(ignore:)&.slice(*kws.keys) == kws
end
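A sketch of the skipping behavior; whitespace and comment tokens are ignored unless ignore_whitespace: false is passed:

t = AppQuery::Tokenizer.new("with  ")
t.read_char(4).emit_token("WITH")
t.read_char(2).emit_token("WHITESPACE")
t.last_emitted?(t: "WITH")                            # => true (WHITESPACE skipped)
t.last_emitted?(t: "WITH", ignore_whitespace: false)  # => false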
#lex_append_cte ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 105

def lex_append_cte
  emit_token "COMMA", v: ","
  emit_token "WHITESPACE", v: "\n "
  push_return :lex_recursive_cte
end
#lex_comment ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 305

def lex_comment
  err "Expected comment, i.e. '--' or '/*'" unless match_comment?

  if match?("--")
    read_until(/\n/)
  else
    read_until %r{\*/}
    err "Expected comment close '*/'." if eos?
    read_char 2
  end

  emit_token "COMMENT"
  push_return :lex_whitespace
end
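For instance, a leading line comment becomes a COMMENT token, followed by its trailing whitespace:

AppQuery::Tokenizer.tokenize("-- hi\nselect 1").map { _1[:t] }
# => ["COMMENT", "WHITESPACE", "SELECT"]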
#lex_cte ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 146

def lex_cte
  if match_comment?
    push_return :lex_cte, :lex_comment
  elsif last_emitted? t: "CTE_IDENTIFIER", ignore_whitespace: true
    if match?(/AS(\s|\()/i)
      read_char 2
      emit_token "AS"
      push_return :lex_cte, :lex_cte_select, :lex_maybe_materialized, :lex_whitespace
    elsif match?(%r{\(}) # "foo " "(id)"
      push_return :lex_cte, :lex_cte_columns
    else
      err "Expected 'AS' or CTE columns following CTE-identifier, e.g. 'foo AS' 'foo()'"
    end
  elsif last_emitted? t: "CTE_COLUMNS_CLOSE", ignore_whitespace: true
    if match?(/AS(\s|\()/i)
      read_char 2
      emit_token "AS"
      push_return :lex_cte, :lex_cte_select, :lex_maybe_materialized, :lex_whitespace
    else
      err "Expected 'AS' following CTE-columns"
    end
  elsif last_emitted? t: "CTE_SELECT", ignore_whitespace: true
    if match?(/,/) # but wait, there's more!
      read_char
      emit_token "CTE_COMMA"
      push_return :lex_cte, :lex_whitespace
    end
  else
    push_return :lex_cte, :lex_cte_identifier
  end
end
#lex_cte_columns ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 204

def lex_cte_columns
  err "Expected CTE columns, e.g. '(id, other)'" unless match? %r{\(}
  read_char
  read_until(/\S/)
  emit_token "CTE_COLUMNS_OPEN"

  loop do
    if match?(/\)/)
      err "Expected a column name" unless last_emitted? t: "CTE_COLUMN"
      read_char
      emit_token "CTE_COLUMNS_CLOSE"
      break
    elsif match?(/,/) # "( " ","
      err "Expected a column name" unless last_emitted? t: "CTE_COLUMN"
      read_char # ','
      read_until(/\S/)
      emit_token "CTE_COLUMN_DIV"
    elsif match?(/"/)
      unless last_emitted? t: "CTE_COLUMNS_OPEN"
        err "Expected comma" unless last_emitted? t: "CTE_COLUMN_DIV"
      end
      read_char
      read_until(/"/)
      read_char
      emit_token "CTE_COLUMN"
    elsif match?(/[_A-Za-z]/)
      unless last_emitted? t: "CTE_COLUMNS_OPEN"
        err "Expected comma" unless last_emitted? t: "CTE_COLUMN_DIV"
      end
      read_until %r{,|\s|\)}
      emit_token "CTE_COLUMN"
    elsif match?(/\s/)
      read_until(/\S/)
    else # e.g. "(id," "1)" or eos?
      err "Expected valid column name"
    end
  end
  push_return :lex_whitespace
end
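An illustrative run showing the tokens a column list produces:

AppQuery::Tokenizer.tokenize("with foo(id, n) as (select 1) select * from foo")
  .map { _1[:t] }
  .grep(/CTE_COLUMN/)
# => ["CTE_COLUMNS_OPEN", "CTE_COLUMN", "CTE_COLUMN_DIV", "CTE_COLUMN", "CTE_COLUMNS_CLOSE"]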
#lex_cte_identifier ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 279

def lex_cte_identifier
  err "Expected CTE identifier, e.g. 'foo', '\"foo bar\"' " unless match? %r{[_"A-Za-z]}

  if match?(/"/)
    read_char
    read_until(/"/)
    read_char
  else
    read_until %r{\s|\(}
  end
  emit_token "CTE_IDENTIFIER"
  push_return :lex_whitespace
end
#lex_cte_select ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 254

def lex_cte_select
  err "Expected CTE select, e.g. '(select 1)'" unless match? %r{\(}
  read_char
  level = 1

  loop do
    read_until(/\)|\(/)

    if eos?
      err "CTE select ended prematurely"
    elsif match?(/\(/)
      level += 1
    elsif match?(/\)/)
      level -= 1
      break if level.zero?
    end
    read_char
  end
  err "Expected non-empty CTE select, e.g. '(select 1)'" if chars_read.strip == "("
  read_char
  emit_token "CTE_SELECT"
  push_return :lex_whitespace
end
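The level counter tracks nested parentheses, so parenthesized expressions inside the select stay within a single CTE_SELECT token. A sketch, entering this state directly:

t = AppQuery::Tokenizer.new("(select max(id) from users)", state: :lex_cte_select)
t.run.first[:v]  # => "(select max(id) from users)"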
#lex_maybe_materialized ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 182

def lex_maybe_materialized
  if match?(/materialized/i)
    read_until(/\(/)
    emit_token "MATERIALIZED"
  elsif match?(%r{\(})
    # done
  elsif match?(/not\s/i)
    read_char 3
    read_until(/\S/)
    emit_token "NOT_MATERIALIZED"

    err "Expected 'MATERIALIZED'" unless match?(/materialized/i)
    push_return :lex_maybe_materialized
  else
    err "Expected CTE select or NOT? MATERIALIZED"
  end
end
#lex_prepend_cte ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 94

def lex_prepend_cte
  if eos?
    emit_token "COMMA", v: ","
    emit_token "WHITESPACE", v: "\n"
  elsif match?(/\s/)
    push_return :lex_prepend_cte, :lex_whitespace
  else
    push_return :lex_prepend_cte, :lex_recursive_cte
  end
end
#lex_recursive_cte ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 111

def lex_recursive_cte
  if match?(/recursive\s/i)
    read_until(/\s/)
    # make trailing whitespace part of next token
    # this makes adding cte's easier
    read_until(/\S/)
    emit_token "RECURSIVE"
  end

  push_return :lex_cte
end
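For example:

AppQuery::Tokenizer.tokenize("with recursive nums as (select 1) select * from nums")
  .map { _1[:t] }
  .first(3)
# => ["WITH", "RECURSIVE", "CTE_IDENTIFIER"]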
#lex_select ⇒ Object
There should always be a SELECT.
# File 'lib/app_query/tokenizer.rb', line 295

def lex_select
  read_until(/\Z/)
  read_char

  if last_emitted? t: "COMMENT", ignore_whitespace: false
    emit_token "WHITESPACE", v: "\n"
  end
  emit_token "SELECT"
end
#lex_sql ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 71

def lex_sql
  if last_emitted? t: "CTE_SELECT", ignore: %w[WHITESPACE COMMENT]
    push_return :lex_select
  elsif match?(/\s/)
    push_return :lex_sql, :lex_whitespace
  elsif match_comment?
    push_return :lex_sql, :lex_comment
  elsif match?(/with/i)
    push_return :lex_sql, :lex_with
  else
    push_return :lex_select
  end
end
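lex_sql is the initial state (see #initialize); an illustrative full pass over a simple CTE query:

AppQuery::Tokenizer.tokenize("with foo as (select 1) select * from foo")
  .map { _1.values_at(:t, :v) }
# => [["WITH", "with "], ["CTE_IDENTIFIER", "foo"], ["WHITESPACE", " "],
#     ["AS", "as"], ["WHITESPACE", " "], ["CTE_SELECT", "(select 1)"],
#     ["WHITESPACE", " "], ["SELECT", "select * from foo"]]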
#lex_whitespace ⇒ Object
Optional; a WHITESPACE token is emitted only when whitespace is present.
# File 'lib/app_query/tokenizer.rb', line 321

def lex_whitespace
  if match?(/\s/)
    read_until(/\S/)
    emit_token "WHITESPACE"
  end
end
#lex_with ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 85

def lex_with
  err "Expected 'WITH'" unless match? %r{WITH\s}i

  read_until(/\s/)
  read_until(/\S/)
  emit_token "WITH"
  push_return :lex_recursive_cte
end
#match?(re) ⇒ Boolean
# File 'lib/app_query/tokenizer.rb', line 49

def match?(re)
  rest[Regexp.new("\\A%s" % re)]
end
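The pattern is anchored at the current position, and the match (or nil) is returned rather than true/false. A sketch:

t = AppQuery::Tokenizer.new("WITH foo")
t.match?(/with\s/i)  # => "WITH " (truthy: matches at pos)
t.match?(/foo/)      # => nil (present, but not at pos)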
#match_comment? ⇒ Boolean
# File 'lib/app_query/tokenizer.rb', line 200

def match_comment?
  match?(%r{--|/\*})
end
#push_return(*steps) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 59

def push_return(*steps)
  (@return ||= []).push(*steps)
  self
end
#read_char(n = 1) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 40

def read_char(n = 1)
  @pos = [pos + n, input.size].min
  self
end
#read_until(pattern) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 64

def read_until(pattern)
  loop do
    break if match?(pattern) || eos?
    read_char
  end
end
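read_until stops just before the pattern (or at end of input), leaving the match for the next step. A sketch:

t = AppQuery::Tokenizer.new("select 1 -- done")
t.read_until(/--/)
t.chars_read  # => "select 1 "
t.rest        # => "-- done"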
#rest ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 45

def rest
  input[pos...]
end
#run(pos: nil) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 329

def run(pos: nil)
  loop do
    break if step.nil?
  end
  eos? ? tokens : self
end
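When the input is fully consumed, run returns the token array; otherwise it returns self, which can be useful for inspecting a partial run. Illustrative:

t = AppQuery::Tokenizer.new("  rest", state: :lex_whitespace)
t.run     # => t itself: the state queue is empty but input remains
t.tokens  # => [{v: "  ", t: "WHITESPACE", start: 0, end: 2}]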
#step ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 336

def step
  if (state = @return.pop)
    method(state).call
    self
  end
end