2024-09-14

aho-corasick substring search algorithm

links

minimal implementation in coffeescript

class node_class
  constructor: ->
    @children = {}
    @fail = null
    @outputs = []

class state_machine_class
  constructor: (keywords) ->
    @root = new node_class()
    @build_trie keywords
    @build_failure()

  build_trie: (keywords) ->
    for keyword in keywords
      current = @root
      for char in keyword
        unless current.children[char]
          current.children[char] = new node_class()
        current = current.children[char]
      current.outputs.push keyword

  build_failure: ->
    queue = []
    for key, node of @root.children
      node.fail = @root
      queue.push node
    while queue.length > 0
      current = queue.shift()
      for key, child of current.children
        fail = current.fail
        while fail and not fail.children[key]
          fail = fail.fail
        child.fail = if fail then fail.children[key] else @root
        child.outputs = child.outputs.concat child.fail.outputs
        queue.push child

  search: (text) ->
    current = @root
    results = []
    for i in [0...text.length]
      char = text[i]
      while current and not current.children[char]
        current = current.fail
      current = if current then current.children[char] else @root
      if current.outputs.length > 0
        for output in current.outputs
          results.push [i - output.length + 1, output]
    results

search = (text, keywords) ->
  state_machine = new state_machine_class keywords
  console.log state_machine.search text

search "ushers", ["he", "hers", "she", "his"]