WITH text_input(id, text) AS (
VALUES
(1, 'The quick brown fox jumps over the lazy dog'),
(2, 'The quick and the lazy'),
(3, 'The quick brown fox jumps over the dog')
),
text_ngrams(id, ngrams) AS (
SELECT id,
transform(
ngrams(
split(text, ' '),
4
),
token -> array_join(token, ' ')
)
FROM text_input
),
minhash_digest(id, digest) AS (
SELECT id,
(SELECT make_set_digest(v) FROM unnest(ngrams) u(v))
FROM text_ngrams
),
setdigest_side_by_side(id1, digest1, id2, digest2) AS (
SELECT m1.id as id1,
m1.digest as digest1,
m2.id as id2,
m2.digest as digest2
FROM (SELECT id, digest FROM minhash_digest) m1
JOIN (SELECT id, digest FROM minhash_digest) m2
ON m1.id != m2.id AND m1.id < m2.id
)
SELECT id1,
id2,
intersection_cardinality(digest1, digest2) AS intersection_cardinality,
jaccard_index(digest1, digest2) AS jaccard_index
FROM setdigest_side_by_side
ORDER BY id1, id2;