String Matching

Brute-Force String Matching

The brute-force string matching algorithm is the simplest approach to solving the problem. It checks for the presence of the pattern at every possible position in the text.

Algorithm Steps

Iterate through the Text: For each position in the text, check if the pattern matches starting from that position.
Check for Match: Compare the characters of the pattern with the corresponding characters in the text.
Record Matches: If a match is found, record the starting index of the match.

Code Example:

/**
 * Perform brute-force string matching to find the pattern in the text.
 * @param {string} text - The text string.
 * @param {string} pattern - The pattern string.
 * @return {number[]} - The starting indices of occurrences of the pattern.
 */
const bruteForceSearch = (text, pattern) => {
  const result = [];
  const m = pattern.length;
  const n = text.length;

  for (let i = 0; i <= n - m; i++) {
    let j;
    for (j = 0; j < m; j++) {
      if (text[i + j] !== pattern[j]) break;
    }
    if (j === m) {
      result.push(i);
    }
  }
  
  return result;
};

Rabin Karp

function search(text, pattern) {
  const D = 256; // ASCII length
  const MOD = 1e9 + 7; // Prime modulus

  const N = text.length;
  const M = pattern.length;

  if (M === 0) return 0; // Edge case: empty pattern

  let p = 0; // Hash value for the pattern
  let t = 0; // Hash value for the text window
  const PREV = Math.pow(D, M - 1) % MOD; // Highest power of D (to remove the first character)

  // Calculate hash value for the pattern and the first window of text
  for (let i = 0; i < M; i++) {
    p = (D * p + pattern.charCodeAt(i)) % MOD;
    t = (D * t + text.charCodeAt(i)) % MOD;
  }

  // Slide the window over the text
  for (let i = 0; i <= N - M; i++) {
    // If the hash values match, check the characters one by one
    if (p === t) {
      let j = 0;
      while (j < M && pattern[j] === text[i + j]) j++; // Character comparison

      // If the full pattern matches, return the index
      if (j === M) {
        return i;
      }
    }

    // Update the hash for the next window
    if (i < N - M) {
      t = (D * (t - text.charCodeAt(i) * PREV) + text.charCodeAt(i + M)) % MOD;
      if (t < 0) t += MOD; // Handle negative values
    }
  }

  return -1; // Return -1 if the pattern is not found
}

Knuth-Morris-Pratt (KMP) Algorithm

The Knuth-Morris-Pratt (KMP) algorithm is an efficient string searching algorithm that finds occurrences of a "pattern" string within a "text" string. It improves upon the naive string matching algorithm by avoiding redundant comparisons.

Overview

The KMP algorithm uses information gained from previous comparisons to avoid redundant checking. It preprocesses the pattern to create a partial match table (also known as the "longest prefix suffix" or LPS array), which helps in skipping unnecessary comparisons when a mismatch occurs.

Algorithm Steps

Preprocess the Pattern: Construct the LPS array for the pattern.
Search for the Pattern: Use the LPS array to efficiently search for the pattern in the text.

Preprocessing the Pattern

The LPS array (Longest Prefix Suffix) stores the length of the longest proper prefix of the pattern which is also a suffix. This allows the algorithm to skip unnecessary comparisons in the search phase.

Code Example:

/**
 * Build the LPS (Longest Prefix Suffix) array for the pattern.
 * @param {string} pattern - The pattern string.
 * @return {number[]} - The LPS array.
 */
const buildLPSArray = (pattern) => {
  const lps = Array(pattern.length).fill(0);
  let length = 0;
  let i = 1;

  while (i < pattern.length) {
    if (pattern[i] === pattern[length]) {
      length++;
      lps[i] = length;
      i++;
    } else {
      if (length > 0) {
        length = lps[length - 1];
      } else {
        lps[i] = 0;
        i++;
      }
    }
  }
  
  return lps;
};

/**
 * Perform KMP search to find the pattern in the text.
 * @param {string} text - The text string.
 * @param {string} pattern - The pattern string.
 * @return {number[]} - The starting indices of occurrences of the pattern.
 */
const kmpSearch = (text, pattern) => {
  const lps = buildLPSArray(pattern);
  const result = [];
  let i = 0; // Index for text
  let j = 0; // Index for pattern

  while (i < text.length) {
    if (pattern[j] === text[i]) {
      i++;
      j++;
    }
    
    if (j === pattern.length) {
      result.push(i - j);
      j = lps[j - 1];
    } else if (i < text.length && pattern[j] !== text[i]) {
      if (j > 0) {
        j = lps[j - 1];
      } else {
        i++;
      }
    }
  }
  
  return result;
};

const text = "ababcababcabc";
const pattern = "abc";

console.log(kmpSearch(text, pattern)); // Output: [2, 7, 10]

Using Trie

class TrieNode {
  constructor() {
    this.children = {};
    this.isEndOfWord = false; // Marks the end of a pattern
  }
}

class Trie {
  constructor() {
    this.root = new TrieNode();
  }

  insert(word) {
    let currentNode = this.root;

    for (const char of word) {
      if (!currentNode.children[char]) {
        currentNode.children[char] = new TrieNode();
      }
      currentNode = currentNode.children[char];
    }
    currentNode.isEndOfWord = true; // Mark the end of the pattern
  }

  search(text) {
    const results = [];
    const n = text.length;

    for (let i = 0; i < n; i++) {
      let currentNode = this.root;

      for (let j = i; j < n && currentNode.children[text[j]]; j++) {
        currentNode = currentNode.children[text[j]];
        if (currentNode.isEndOfWord) {
          results.push({ pattern: text.substring(i, j + 1), index: i });
        }
      }
    }

    return results;
  }
}

// Example Usage
const trie = new Trie();
trie.insert('he');
trie.insert('she');
trie.insert('his');
trie.insert('hers');

const text = 'ushers';
const results = trie.search(text);
console.log(results); // Output: [{ pattern: 'she', index: 1 }, { pattern: 'hers', index: 3 }]
/* 
TC : O(N*K)
K is the maximum length of any pattern that has been inserted into the Trie.
where N is the length of the text.

SC : O(M+R)
M is the number of characters in all patterns in the Trie.
R is the number of matched patterns (or substrings) found during the search.
*/

Aho-Corasick Algorithm

Use Aho-Corasick when you need to search for multiple patterns in a single text, especially when those patterns can share common prefixes
Use KMP for single pattern searches where space efficiency is a concern or when the patterns are relatively small and do not need to be checked against a larger set

class Node {

  constructor() {
    this.outputs = new Set();
    this.children = new Map();
    this.failureLink = null;
  }

  hasChild(key) {
    return this.children.has(key);
  }

  getChild(key) {
    return this.children.get(key);
  }

  setChild(key, node) {
    this.children.set(key, node);
  }

  addOutput(output) {
    this.outputs.add(output);
  }

  copyOutputs(node) {
    for (const o of node.outputs) {
      this.outputs.add(o);
    }
  }

}

class AhoCorasick {

  constructor(patterns) {

    // construct the trie
    this.root = new Node();

    let currNode = this.root;
    for (const pattern of patterns) {
      for (let i = 0; i < pattern.length; i++) {
        const key = pattern[i];
        if (!currNode.hasChild(key)) {
          currNode.setChild(key, new Node());
        }
        currNode = currNode.getChild(key);
      }
      currNode.addOutput(pattern);
      currNode = this.root;
    }

    // failure link
    this.root.failureLink = this.root;
    const queue = [];
    for (const [_, child] of this.root.children) {
      child.failureLink = this.root;
      queue.push(child);
    }

    while (queue.length !== 0) {
      currNode = queue.shift();
      for (const [key, child] of currNode.children) {
        queue.push(child);

        let n = currNode.failureLink;
        while (!n.hasChild(key) && n != this.root) {
          n = n.failureLink;
        }
        child.failureLink = n.getChild(key) ?? this.root;
        child.copyOutputs(child.failureLink);

      }
    }
  }

  search(text) {
    const found = [];
    let state = this.root;
    let i = 0;
    while (i < text.length) {
      const c = text[i];
      if (state.hasChild(c)) {
        state = state.getChild(c);
        i++;
        if (state.outputs.size > 0) {
          state.outputs.forEach(val => {
            found.push({ "pos": i - val.length, "val": val });
          });
        }
      } else if (state === this.root) {
        i++;
      } else {
        state = state.failureLink;
      }
    }
    return found;
  }
}

// Test the Aho-Corasick implementation
function testAhoCorasick() {
  const patterns = ['he', 'she', 'his', 'hers'];
  const text = 'ushers';
  const ac = new AhoCorasick(patterns);
  const results = ac.search(text);

  results.forEach(result => {
    console.log(`Pattern: "${result.val}" found at position: ${result.pos}`);
  });
}

// Run the test
testAhoCorasick();

/* 
Pattern: "she" found at position: 1
Pattern: "he" found at position: 2
Pattern: "hers" found at position: 2
*/

/* 
TC : O(P+N+T)
P is the total length of all patterns
N is the number of nodes in the trie (which can be up to P).
T is the length of the text.

SC : O(N+M)
N is the number of nodes in the trie
M is the number of matches found in the text (this could be up to T, but usually M≤T)
*/

Brute-Force String Matching​

Algorithm Steps​

Rabin Karp​

Knuth-Morris-Pratt (KMP) Algorithm​

Overview​

Algorithm Steps​

Preprocessing the Pattern​

Using Trie​

Aho-Corasick Algorithm​

Brute-Force String Matching

Algorithm Steps

Rabin Karp

Knuth-Morris-Pratt (KMP) Algorithm

Overview

Algorithm Steps

Preprocessing the Pattern

Using Trie

Aho-Corasick Algorithm