2025-05-19

aho-corasick substring search algorithm

links

minimal implementation in coffeescript

# create a trie node
make_node = -> children: {}, fail: null, outputs: []

# build the Aho-Corasick automaton
build_automaton = (keywords) ->
  root = make_node()

  # insert keywords into the trie
  for kw in keywords
    node = root
    node = node.children[ch] ?= make_node() for ch in kw
    node.outputs.push kw

  # set initial failure links (depth 1 children -> root)
  queue = (child for _, child of root.children)
  child.fail = root for child in queue

  # build failure links using BFS
  while queue.length
    node = queue.shift()
    for ch, child of node.children
      f = node.fail
      while f and not f.children[ch]
        f = f.fail
      child.fail = if f then f.children[ch] else root
      # merge outputs from fail link
      child.outputs = child.outputs.concat child.fail.outputs
      queue.push child

  # return a function to search text
  (text) ->
    node = root
    res = []
    for i in [0...text.length]
      ch = text[i]
      while node and not node.children[ch]
        node = node.fail
      node = if node then node.children[ch] else root
      # record matches ending at position i
      res.push [i - kw.length + 1, kw] for kw in node.outputs
    res

example usage

search = build_automaton ["he", "hers", "she", "his"]
console.log search "ushers"

the content of root as json

{
  "children": {
    "h": {
      "children": {
        "e": {
          "children": {
            "r": {
              "children": {
                "s": {
                  "children": {},
                  "fail": {
                    "children": {
                      "h": {
                        "children": {
                          "e": {
                            "children": {},
                            "outputs": [
                              "she",
                              "he"
                            ]
                          }
                        },
                        "outputs": []
                      }
                    },
                    "outputs": []
                  },
                  "outputs": [
                    "hers"
                  ]
                }
              },
              "outputs": []
            }
          },
          "outputs": [
            "he"
          ]
        },
        "i": {
          "children": {
            "s": {
              "children": {},
              "outputs": [
                "his"
              ]
            }
          },
          "outputs": []
        }
      },
      "outputs": []
    }
  },
  "fail": null,
  "outputs": []
}