module Wirb::Tokenizer

Public Class Methods

run(str) { |kind, token| ... }
# File lib/wirb/tokenizer.rb, line 3
# Splits an inspect-string into tokens with a hand-written state machine.
#
# The string is scanned one character at a time. A stack of states decides
# how the current character is interpreted; the character is either appended
# to the pending token, or the pending token is yielded together with its kind.
#
# All scanner state is held in local variables captured by the helper lambdas
# (not in instance variables of the receiver), so nested calls and alternately
# advanced enumerators returned by this method do not disturb each other.
#
# str   - the String to tokenize, or nil.
#
# Yields a kind (Symbol, or nil when the pending token is flushed while the
# state stack is empty) and the token text (String) for each token found.
#
# Returns [] when str is nil, an Enumerator over [kind, token] pairs when no
# block is given, and nil after a complete scan with a block.
#
# Raises ArgumentError when str is neither nil nor a String.
# Raises RuntimeError on an unknown state or when the loop protection detects
# that a repeated round made no progress.
def self.run(str)
  return [] if str.nil?
  raise ArgumentError, 'Tokenizer needs an inspect-string' unless str.is_a? String
  return enum_for(:run, str) unless block_given?

  chars = str.split('')

  states, token, i  =  [], '', 0   # state stack, pending token text, read position
  passed, snapshot  =  '', nil     # everything emitted so far / previous round (loop protection)

  # These must exist before the lambdas are created: a first assignment made
  # inside a lambda body would create a lambda-local variable instead.
  repeat      = false              # true => handle the same character again in the next round
  refers_seen = []                 # one flag per open hash: has a "=>" been seen since the last comma?

  # helpers

  # Yields the pending token (if any) as +kind+.
  # Options: :remove pops the current state, :keep_token leaves the pending
  # token in place, :repeat re-processes the current character.
  pass_custom_state = lambda{ |kind, *options|
    yield kind, token   unless token.empty?
    passed << token
    states.pop          if     options.include?(:remove)
    token  = ''         unless options.include?(:keep_token)
    repeat = true       if     options.include?(:repeat)
  }

  # Like pass_custom_state, using the current state as the token kind.
  pass_state  = lambda{ |*options|
    pass_custom_state[ states[-1], *options ]
  }

  # Yields a literal string directly, bypassing the pending token.
  pass        = lambda{ |kind, string|
    passed << string
    yield kind, string
  }

  # Replaces the current (top) state.
  set_state   = lambda{ |state, *options|
    states[-1] = state
    repeat = true if options.include? :repeat
  }

  # Is +state+ the current state?
  get_state   = lambda{ |state| states[-1] == state }

  # Is +state+ the state just below the current one?
  get_previous_state =
                lambda{ |state| states[-2] == state }

  push_state  = lambda{ |state, *options|
    states << state
    repeat = true if options.include? :repeat
  }

  pop_state   = lambda{ |*options|
    states.pop
    repeat = true if options.include? :repeat
  }

  # action!
  # The loop runs one round past the last character (c is nil there) so that
  # states still holding a pending token get a chance to flush it.
  while i <= chars.size
    repeat = false
    # Sliding window: the two previous characters, the current one and the next one.
    # Note that on a repeated round i is unchanged, so the window shifts again
    # and lc then equals c.
    llc, lc, c, nc = lc, c, chars[i], chars[i+1]

    # warn "char = #{c}  state = #{states*':'}"

    case states[-1]
    when nil, :hash, :array, :enumerator, :set, :variable # default state
      case c
      when '"'      then push_state[:string]
      when '/'      then push_state[:regexp]
      when '#'      then push_state[:object]
      when /[A-Z]/  then push_state[:class,    :repeat]
      when /[a-z]/  then push_state[:word,     :repeat]
      when /[0-9-]/ then push_state[:number,   :repeat]
      when '.'      then push_state[:range,    :repeat]
      when /\=/     then push_state[:equal,    :repeat]

      when /\s/
        if get_state[:variable]
          pop_state[:repeat]
        else
          pass[:whitespace, c]
        end

      when ','
        if get_state[:variable]
          pop_state[:repeat]
        else
          pass[:comma, ',']
          refers_seen[-1] = false if get_state[:hash] # next pair may have its own "=>"
        end

      when ':'
        if get_state[:enumerator]
          set_state[:object_description, :repeat]
        else
          push_state[:symbol]
        end

      when '>'
        pop_state[:repeat]
      when '('
        peek = chars[i+1..-1].join
        if peek =~ /^-?(?:Infinity|NaN|[0-9.e]+)[+-](?:Infinity|NaN|[0-9.e]+)\*?i\)/
          push_state[:complex, :repeat]
        elsif nc =~ /[0-9-]/
          if passed =~ /Complex$/ # cheat for old 1.8
            push_state[:complex, :repeat]
          else
            push_state[:rational, :repeat]
          end
        else
          push_state[:object_description, :repeat]
          open_brackets = 0
        end

      when '{'
        if get_state[:set]
          pass[:open_set, '{']; push_state[nil] # {{ means set-hash
        else
          pass[:open_hash, '{']; push_state[:hash]
          refers_seen.push false
        end

      when '['
        pass[:open_array, '[']; push_state[:array]

      when ']'
        if get_state[:array]
          pass[:close_array, ']']
          pop_state[]
          pop_state[] if get_state[:enumerator]
        end

      when '}'
        if get_state[:hash]
          pass[:close_hash, '}']
          refers_seen.pop
        elsif get_previous_state[:set]
          pass[:close_set, '}']
          pop_state[] # remove extra nil state
        end
        pop_state[]
        pop_state[] if get_state[:enumerator]

      when '<'
        pass[:open_object, '<']
        push_state[:object]
        push_state[:object_class]
        open_brackets = 0
      end

    when :class
      case c
      when /[a-z0-9_]/i
        token << c
      else
        if token =~ /^(Infinity|NaN)$/
          set_state[:special_number, :repeat]
        elsif c ==':' && nc == ':'
          pass_state[]
          pass[:class_separator, '::']
        elsif !(c == ':' && lc == ':') # second ':' of a '::' is skipped silently
          pass_state[:remove, :repeat]
        end
      end

    when :symbol
      case c
      when /"/
        if lc == '$' && llc == ':' # :$" is a plain symbol
          token << c
        else
          pass[:symbol_prefix, ':']
          set_state[:symbol_string]
        end
      when /[^"., }\])=]/
        token << c
      else
        if c == ']' && lc == '['
          token << c
        elsif c == '=' && nc != '>'
          token << c
        elsif c =~ /[.,]/ && lc == '$' && llc == ':'
          token << c
        else
          pass[:symbol_prefix, ':']
          pass_state[:remove, :repeat]
        end
      end

    when :symbol_string
      if c == '"' && ( !( token =~ /\\+$/; $& ) || $&.size % 2 == 0 ) # see string
        pass[:open_symbol_string, '"']
        pass_state[:remove]
        pass[:close_symbol_string, '"']
      else
        token << c
      end

    when :string
      if c == '"' && ( !( token =~ /\\+$/; $& ) || $&.size % 2 == 0 ) # allow escaping of " and
        pass[:open_string, '"']                             # work around \\
        pass_state[:remove]
        pass[:close_string, '"']
      else
        token << c
      end

    when :regexp
      if c == '/' && ( !( token =~ /\\+$/; $& ) || $&.size % 2 == 0 ) # see string
        pass[:open_regexp, '/']
        pass_state[:remove]
        pass[:close_regexp, '/']
        push_state[:regexp_flags]
      else
        token << c
      end

    when :regexp_flags
      if c =~ /[a-z]/i #*%w[m i x o n e u s]
        token << c
      else
        pass_state[:remove, :repeat]
      end

    when :word
      if c =~ /[a-z0-9_]/i
        token << c
        # nil, false and true are emitted under their own kind as soon as they are complete
        pass_custom_state[token.to_sym, :remove] if %w[nil false true].include?(token)
      else
        pass_state[:remove, :repeat]
      end

    when :number
      if c == '-' && token != '' && token[-1] != 'e'
        set_state[:time, :repeat]
      elsif c =~ /[IN]/
        set_state[:special_number, :repeat]
      elsif c =~ /[0-9e.*i+-]/ && !(c == '.' && nc == '.')
        token << c
      else
        pass_state[:remove, :repeat]
      end

    when :time # via regex, state needs to be triggered somewhere else
      peek = chars[i..-1].join
      if [
        /^\d+-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:\.\d{,9})? (?:[+-]\d{4}|[a-z]{3})/i,
      ].any?{ |regex|
        ( token + peek ) =~ regex
      } # found, adjust parsing-pointer:
        i = i + $&.size - token.size - 1
        token = $&
        pass_state[:remove]
      else
        token << c
        pop_state[:remove]
      end

    when :special_number # like time, refactor if code is needed a third time
      peek = chars[i..-1].join
      if [
        /^[-+]?Infinity/,
        /^[-+]?NaN/,
      ].any?{ |regex|
        ( token + peek ) =~ regex
      } # found, adjust parsing-pointer:
        i = i + $&.size - token.size - 1
        token = $&
        pass_state[]
        set_state[:number]
      else
        # TODO verify...
        token << c
        set_state[:number]
      end

    when :range
      if c == '.'
        token << c
      else
        pass_state[:remove, :repeat]
      end

    when :rational
      case c
      when '('
        pass[:open_rational, '(']
      when /[0-9-]/
        push_state[:number, :repeat]
      when '/', ','
        pass[:rational_separator, c]
      when ' '
        pass[:whitespace, c]
      when ')'
        pass[:close_rational, ')']
        pop_state[]
      end

    when :complex
      case c
      when '('
        pass[:open_complex, '(']
      when /[0-9+-]/
        push_state[:number, :repeat]
      when ','
        pass[:number, c] # complex_separator
      when ' '
        pass[:whitespace, c]
      when ')'
        pass[:close_complex, ')']
        pop_state[]
      end

    when :object
      case c
      when '<'
        pass[:open_object, '#<']
        push_state[:object_class]
        open_brackets = 0
      when '>'
        pass[:close_object, '>']
        pop_state[]
        pop_state[] if get_state[:enumerator]
      end

    when :object_class
      case c
      when /[a-z0-9_]/i
        token << c
      when '>'
        pass_state[:remove, :repeat]
      else
        if c == ':' && nc == ':'
          pass_state[]
          pass[:class_separator, '::']
        elsif c != ':' || lc != ':'
          # emit the class name but keep it: it selects the follow-up state below
          pass_state[:keep_token]
          pass[:object_description_prefix, c]

          token = token.downcase
          if %w[set instructionsequence].include?(token)
            set_state[token.to_sym]
          else
            set_state[:object_description]
            if token == "enumerator" && RUBY_ENGINE != "rbx"
              push_state[:enumerator]
            end
          end
          token = ''
        end
      end

    when :object_description
      case c
      when '>', nil
        if open_brackets == 0
          pass_state[:remove, :repeat]
        else
          open_brackets -= 1
          token << c
        end
      when '#'
        if nc == '<'
          pass_state[]
          push_state[:object]
        else
          token << c
        end
      when '<'
        open_brackets += 1
        token << c
      when '@'
        if nc =~ /[a-z_]/i
          pass_state[]
          push_state[:object_variable]
        else
          token << c
        end
      when '"'
        pass_state[]
        push_state[:string]
      when /[0-9]/
        if c == '0' && nc == 'x'
          pass_state[:repeat]
          push_state[:object_address]
        else
          # push_state[:number, :repeat]
          token << c
        end
      else
        token << c
      end

    when :object_variable
      if c =~ /[a-z0-9_]/i
        token << c
      else
        pass[:object_variable_prefix, '@']
        pass_state[:remove]
        pass[:object_description, '=']
        push_state[:variable]
      end

    when :object_address
      if c =~ /[x0-9a-f]/
        token << c
      else
        if c == '@' || c == ' ' && nc != "@"
          pass_state[:remove]
          push_state[:object_line]
          pass[:object_line_prefix, c]
        else
          pass_state[:remove, :repeat]
        end
      end

    when :object_line
      if c == ':' && nc =~ /[0-9]/
        token << ':'
        pass_state[:remove]
        push_state[:object_line_number]
      elsif c == '>' # e.g. RubyVM
        pass_state[:remove, :repeat]
      else
        token << c
      end

    when :object_line_number
      if c =~ /[0-9]/
        token << c
      else
        pass_state[:remove, :repeat]
      end

    when :equal
      if c == '>' && lc == '='
        token = ''; pop_state[] # TODO in pass helper
        if get_state[:hash]
          # only the first "=>" of a hash pair is the key/value separator
          if nc == '=' || refers_seen[-1]
            pass[:symbol, '=>']
          else
            pass[:refers, '=>']
            refers_seen[-1] = true
          end
        else # MAYBE remove this <=> cheat
          pass[:symbol, '=>']
        end
      elsif c =~ /\S/
        token << c
      else
        pass[:whitespace, c]
      end

    when :instructionsequence # RubyVM
      if c =~ /[^@]/i
        token << c
      else
        pass[:object_line_prefix, token + '@']
        token = ''
        set_state[:object_line]
      end

    else
      raise "unknown state #{states[-1]} #{states.inspect}"
    end

    # next round
    if !repeat
      i += 1
    elsif snapshot && Marshal.load(snapshot) == [states, token, llc, lc, c, nc] # loop protection
      raise "This might be a WIRB bug, please open an issue at:\nhttps://github.com/janlelis/wirb/issues/new"
    end

    snapshot = Marshal.dump([states, token, llc, lc, c, nc])
  end
end