Suffix Array — O(n log n) Construction and Applications

Sanjeev SharmaSanjeev Sharma
2 min read

Advertisement

Suffix Array

Sort all suffixes of a string lexicographically. sa[i] = starting index of the i-th lexicographically smallest suffix.

s = "banana"
suffixes:  banana, anana, nana, ana, na, a
sorted:    a, ana, anana, banana, na, nana
sa:        [5, 3, 1, 0, 4, 2]

O(n log²n) Construction (Doubling)

def build_suffix_array(s):
    n = len(s)
    sa = list(range(n))
    rank = [ord(c) for c in s]
    tmp = [0]*n
    k = 1
    while k < n:
        def key(i):
            return (rank[i], rank[i+k] if i+k < n else -1)
        sa.sort(key=key)
        tmp[sa[0]] = 0
        for i in range(1, n):
            tmp[sa[i]] = tmp[sa[i-1]] + (key(sa[i]) != key(sa[i-1]))
        rank = tmp[:]
        if rank[sa[-1]] == n-1: break
        k *= 2
    return sa

Python

def build_suffix_array(s):
    n = len(s)
    sa = sorted(range(n), key=lambda i: s[i:])
    return sa

def build_lcp(s, sa):
    n = len(s)
    rank = [0]*n
    for i, suf in enumerate(sa): rank[suf] = i
    lcp = [0]*n
    h = 0
    for i in range(n):
        if rank[i] > 0:
            j = sa[rank[i]-1]
            while i+h < n and j+h < n and s[i+h] == s[j+h]:
                h += 1
            lcp[rank[i]] = h
            if h: h -= 1
    return lcp

JavaScript

function buildSuffixArray(s){
    const n=s.length;
    return Array.from({length:n},(_,i)=>i).sort((a,b)=>s.slice(a)<s.slice(b)?-1:1);
}

Java

public int[] buildSuffixArray(String s) {
    int n=s.length();
    Integer[]sa=new Integer[n]; for(int i=0;i<n;i++)sa[i]=i;
    Arrays.sort(sa,(a,b)->s.substring(a).compareTo(s.substring(b)));
    return Arrays.stream(sa).mapToInt(Integer::intValue).toArray();
}

C++

#include <vector>
#include <string>
#include <algorithm>
using namespace std;
vector<int> buildSuffixArray(string s){
    int n=s.size(); vector<int>sa(n);
    iota(sa.begin(),sa.end(),0);
    sort(sa.begin(),sa.end(),[&](int a,int b){return s.substr(a)<s.substr(b);});
    return sa;
}

C

/* C: qsort with strcmp on suffix pointers */

Application: Number of Distinct Substrings

def countDistinctSubstrings(s):
    n = len(s)
    sa = build_suffix_array(s)
    lcp = build_lcp(s, sa)
    total = n*(n+1)//2
    return total - sum(lcp)

Advertisement

Sanjeev Sharma

Written by

Sanjeev Sharma

Full Stack Engineer · E-mopro