Latent Semantic Analysis with Python
In this project we perform latent semantic analysis (LSA) on a large set of documents. We first build a document-term matrix with tf-idf weighting, and then compute a truncated SVD of that matrix.
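For reference, the standard tf-idf weight of word \(j\) in document \(i\) is

\[ w_{ij} = \mathrm{tf}_{ij} \cdot \log\frac{N}{\mathrm{df}_j}, \]

where \(\mathrm{tf}_{ij}\) is how many times word \(j\) appears in document \(i\), \(N\) is the total number of documents, and \(\mathrm{df}_j\) is the number of documents containing word \(j\). (The Cython helper below uses the word's total corpus frequency in place of \(\mathrm{df}_j\), a rougher but simpler stand-in.)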
Loading Data
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.sparse import coo_matrix
from scipy.sparse import dok_matrix
import scipy.sparse.linalg as ssl
import scipy.sparse as scs
from tqdm import tqdm
%load_ext cython
First we’ll load the data. We’ll use the Jeopardy dataset since it’s small.
data = pd.read_csv('./data/JEOPARDY_CSV.csv')
data = data[:1000]
data.head()
|   | Show Number | Air Date | Round | Category | Value | Question | Answer |
|---|---|---|---|---|---|---|---|
| 0 | 4680 | 2004-12-31 | Jeopardy! | HISTORY | $200 | For the last 8 years of his life, Galileo was ... | Copernicus |
| 1 | 4680 | 2004-12-31 | Jeopardy! | ESPN's TOP 10 ALL-TIME ATHLETES | $200 | No. 2: 1912 Olympian; football star at Carlisl... | Jim Thorpe |
| 2 | 4680 | 2004-12-31 | Jeopardy! | EVERYBODY TALKS ABOUT IT... | $200 | The city of Yuma in this state has a record av... | Arizona |
| 3 | 4680 | 2004-12-31 | Jeopardy! | THE COMPANY LINE | $200 | In 1963, live on "The Art Linkletter Show", th... | McDonald's |
| 4 | 4680 | 2004-12-31 | Jeopardy! | EPITAPHS & TRIBUTES | $200 | Signer of the Dec. of Indep., framer of the Co... | John Adams |
Here’s the full first row,
data.values[0]
array([4680, '2004-12-31', 'Jeopardy!', 'HISTORY', '$200',
"For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory",
'Copernicus'], dtype=object)
For our tf-idf matrix we’ll need to know how many documents there are
m = len(data)
Building the TF-IDF matrix
%%cython
import numpy as np
cimport numpy as np
import scipy.sparse as scs
from scipy.sparse import dok_matrix

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def unique_words(list sentences):
    """Count word frequencies across all sentences, keeping only a-z characters."""
    cdef dict words = {}
    cdef int n = len(sentences)
    cdef int i, j
    for i in range(n):
        sent_list = [w.lower() for w in sentences[i].split(' ')]
        clean_sent_list = []
        for j in range(len(sent_list)):
            # strip any character that isn't a lowercase letter
            newword = ''
            for char in sent_list[j]:
                if char in alphabet:
                    newword += char
            clean_sent_list.append(newword)
        for word in clean_sent_list:
            if word != '':
                try:
                    words[word] += 1
                except KeyError:
                    words[word] = 1
    wordlist = sorted(words.keys())
    return wordlist, len(wordlist), words

# Use tf-idf
# https://en.wikipedia.org/wiki/Tf%E2%80%93idf
def populate_doc_matrix(docmatrix, wordlist, word_freq, np.ndarray data):
    cdef int n = len(data)  # number of documents
    cdef int i, j, k, m
    # construct word index first
    # This tells us (for any word) which column it occupies in the document matrix
    print('Constructing Word Reference')
    wordref = {}
    for i in range(len(wordlist)):
        wordref[wordlist[i]] = i
    # Now populate sparse matrix with raw term counts
    print('Populating Sparse Matrix')
    for i in range(n):
        for j in range(2):  # column 0 is the question, column 1 is the answer
            words = [w.lower() for w in data[i, j].split(' ') if w != '']
            m = len(words)
            for k in range(m):
                word = words[k]
                cword = ''
                for char in word:
                    if char in alphabet:
                        cword += char
                if cword != '':
                    docmatrix[i, wordref[cword]] += 1
    # finish weighting
    # note: uses the word's total corpus frequency in place of its document
    # frequency for the idf term (the standalone script below uses true
    # document counts)
    print('Weighting Matrix')
    m, n = docmatrix.shape
    weighted_docmatrix = dok_matrix((m, n), dtype=float)
    for i in range(n):
        weighted_docmatrix[:, i] = docmatrix[:, i] * np.log(m / word_freq[wordlist[i]])
    return weighted_docmatrix, wordref
Building up some helper datasets
words, n, wordfreq = unique_words(list(np.concatenate((data[[' Question']].values[:, 0],
data[[' Answer']].values[:, 0]))))
Using these helper dictionaries we can track down the most frequent words,
print('{} Documents (m) by {} Unique Words (n)\n\nTop 100 Most Frequent Words:{}'.format(
m, n, ','.join([tup[0] for tup in sorted(wordfreq.items(), key=lambda tup: -tup[1])[:100]])))
1000 Documents (m) by 5244 Unique Words (n)
Top 100 Most Frequent Words:the,this,of,a,in,to,for,is,on,was,its,from,as,with,that,an,his,you,these,he,by,it,at,first,one,name,or,city,and,named,state,i,s,are,john,man,country,us,who,have,be,your,has,word,like,new,her,not,seen,called,when,hrefhttpwwwjarchivecommediadjjpg,had,out,were,here,about,can,clue,known,all,show,she,war,but,years,th,if,which,crew,make,now,film,made,wrote,series,may,type,island,more,used,area,than,began,queen,most,also,book,some,term,became,flag,said,part,river,youre,little,george,whose,him
Instead of storing the full dense matrix in memory, we use a sparse matrix format,
docmatrix = dok_matrix((m, n), dtype=float) # m-docs, n-unique words
ndocterm, wordref = populate_doc_matrix(docmatrix, words, wordfreq,
data[[' Question', ' Answer']].values)
Constructing Word Reference
Populating Sparse Matrix
Weighting Matrix
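As a quick, optional sanity check (my addition, not part of the original notebook), we can verify that the weighted matrix really is sparse:

# number of stored (non-zero) entries versus the full m x n size
print('{} stored entries out of {} ({:.3%} dense)'.format(
    ndocterm.nnz, m * n, ndocterm.nnz / (m * n)))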
Once we have our tf-idf matrix, we compute a truncated SVD of its transpose (so rows correspond to words and columns to documents), keeping the top \(k = 20\) singular values.
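For reference, writing the transposed matrix as \(A \in \mathbb{R}^{n \times m}\) (words by documents), the truncated SVD keeps only the \(k\) largest singular values:

\[ A \approx U_k \Sigma_k V_k^\top, \]

where \(U_k\) is \(n \times k\) (one row per word), \(\Sigma_k\) is the \(k \times k\) diagonal matrix of singular values, and \(V_k^\top\) is \(k \times m\) (one column per document). These correspond to the u, s, and vt arrays below.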
u, s, vt = ssl.svds(ndocterm.T, k=20)
u.shape, s.shape, vt.shape
((5244, 20), (20,), (20, 1000))
np.save('umatrix.npy', u)
np.save('smatrix.npy', s)
np.save('vtmatrix.npy', vt)
Now that we have our rank-\(k\) decomposition, let’s query the word “species”.
wordref['species']
4384
So “species” is word index 4384, i.e. row 4384 of u. That row gives the word’s coordinates in the \(k\)-dimensional latent space, and comparing it against the document vectors (the columns of vt) tells us which documents are most related to “species”.
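Here is a minimal sketch of that query (my addition; it assumes the u, s, vt, wordref, and data objects from above and uses a simple cosine similarity in the latent space):

# project the word and the documents into the k-dimensional latent space
word_vec = u[wordref['species']] * s      # shape (20,)
doc_vecs = (vt * s[:, None]).T            # shape (1000, 20), one row per document

# cosine similarity between the word and every document
scores = doc_vecs @ word_vec
scores = scores / (np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(word_vec) + 1e-12)

# questions of the five most related documents
for idx in np.argsort(-scores)[:5]:
    print(round(scores[idx], 3), data[' Question'].values[idx])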
Code
#!/usr/bin/env python3
"""
Perform LSA on given set of text.
See README.md for details
"""
import sys
import re
import math
import time
import argparse
import concurrent.futures
import numpy as np
import scipy.sparse.linalg as ssl
from scipy.sparse import dok_matrix
from scipy.sparse import dok
from tqdm import tqdm
import numba
import enforce
def main():
    """ Manage Execution """
    args = get_args()
    with open(args.filename, 'r') as datafile:
        lines = datafile.read().split('\n')
    size = args.count
    if size == -1:
        size = len(lines)
    documents = np.empty(size, dtype=object)
    for i, line in enumerate(lines):
        if i >= size:
            break
        documents[i] = line
    doccount = len(documents)
    print('Program Start. Loaded Data. Time Elapsed: {}\n'.format(time.clock()))
    words = get_unique_words(documents, args.workers)
    wordcount = len(words.keys())
    topwords = ','.join([w for w, s in sorted(words.items(),
                                              key=lambda tup: -tup[1]['freq'])[:100]])
    print(('Found Word Frequencies\n'
           '\n{} Documents (m) by {} Unique Words (n)\n\n'
           'Top 100 Most Frequent Words:{}\n'
           'Time Elapsed: {}\n').format(doccount,
                                        wordcount,
                                        topwords,
                                        time.clock()))
    docmatrix = get_sparse_matrix(documents, words, args.workers)
    print('Calculated Sparse Matrix\nTime Elapsed: {}\n'.format(time.clock()))
    u, s, vt = ssl.svds(docmatrix, k=args.svdk)
    print('Calculated SVD Decomposition\nTime Elapsed: {}'.format(time.clock()))
@enforce.runtime_validation
def get_unique_words(documents: np.ndarray, workers: int) -> dict:
    """
    Parallelize Unique Word Calculation
    :documents: list of document strings
    :workers: number of workers
    :return: dictionary of word frequencies
    """
    data_bins = np.array_split(documents, workers)
    wordlist = {}
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = {executor.submit(unique_words, data_bins[i]): i for i in range(workers)}
        for future in tqdm(concurrent.futures.as_completed(futures),
                           desc='Determining Unique Words', leave=True, total=workers):
            for word, stats in future.result().items():
                try:
                    wordlist[word]['freq'] += stats['freq']
                    wordlist[word]['doccount'] += stats['doccount']
                except KeyError:
                    wordlist[word] = {'freq': stats['freq'], 'doccount': stats['doccount']}
    return wordlist
@enforce.runtime_validation
def unique_words(data: np.ndarray) -> dict:
    """
    Finds unique word frequencies in documents
    :data: list of document strings
    :return: dictionary of word frequencies
    """
    words = {}
    for doc in data:
        # track which words we've already counted for this document so that
        # doccount reflects the number of documents containing the word
        seen = set()
        for word in doc.split(' '):
            cword = re.sub('[^a-z]+', '', word.lower())
            if cword != '':
                try:
                    words[cword]['freq'] += 1
                except KeyError:
                    words[cword] = {'freq': 1, 'doccount': 0}
                if cword not in seen:
                    words[cword]['doccount'] += 1
                    seen.add(cword)
    return words
@enforce.runtime_validation
def get_sparse_matrix(documents: np.ndarray, words: dict, workers: int) -> dok.dok_matrix:
    """
    Parallelize Sparse Matrix Calculation
    :documents: list of document strings
    :words: dictionary of word frequencies
    :workers: number of workers
    :return: Sparse document term matrix
    """
    m = len(documents)
    n = len(words.keys())
    data_bins = np.array_split(documents, workers)
    # each worker indexes its documents from zero, so remember where each
    # bin starts in the global document ordering
    offsets = np.cumsum([0] + [len(b) for b in data_bins[:-1]])
    docmatrix = dok_matrix((m, n), dtype=float)
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = {executor.submit(parse_docs, data_bins[i], words, len(documents)): i
                   for i in range(workers)}
        for future in tqdm(concurrent.futures.as_completed(futures),
                           desc='Parsing Documents and Combining Arrays',
                           leave=True, total=workers):
            # THIS IS THE BOTTLENECK
            offset = offsets[futures[future]]
            for key, value in future.result().items():
                docmatrix[offset + key[0], key[1]] = value
    return docmatrix
@enforce.runtime_validation
def parse_docs(data: np.ndarray, words: dict, total_doc_count: int) -> dict:
    """
    Parallelize Sparse Matrix Calculation
    :data: list of document strings
    :words: dictionary of word frequencies
    :total_doc_count: total number of documents (for tf-idf)
    :return: Basically sparse array with weighted values
    """
    m = len(data)
    n = len(words.keys())
    docmatrix = {}
    wordref = {w: i for i, w in enumerate(sorted(words.keys()))}
    for i, doc in enumerate(data):
        for word in list(set([re.sub('[^a-z]+', '', w.lower()) for w in doc.split(' ')])):
            if word != '':
                docmatrix[(i, wordref[word])] = weight(total_doc_count,
                                                       words[word]['doccount'],
                                                       words[word]['freq'])
    return docmatrix
@numba.jit
def weight(total_doc_count: int, doccount: int, wordfreq: int) -> float:
    """
    Weighting function for Document Term Matrix.
    tf-idf => https://en.wikipedia.org/wiki/Tf%E2%80%93idf
    """
    return math.log(total_doc_count / doccount) * wordfreq
@enforce.runtime_validation
def get_args() -> argparse.Namespace:
    """
    Get Command line Arguments
    :return: args
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '--workers', type=int, default=32,
                        help=('Number of workers to use for multiprocessing'))
    parser.add_argument('-c', '--count', type=int, default=-1,
                        help=('Number of documents to use from original set'))
    parser.add_argument('-k', '--svdk', type=int, default=20,
                        help=('SVD Degree'))
    parser.add_argument('-f', '--filename', type=str, default='./data/jeopardy.csv',
                        help=('File to use for analysis'))
    args = parser.parse_args()
    return args
if __name__ == '__main__':
    sys.exit(main())
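Assuming the script is saved as lsa.py (a placeholder name on my part), it can be run from the command line with the flags defined in get_args, for example limiting to 1000 documents, 20 singular values, and 8 workers:

python3 lsa.py -f ./data/jeopardy.csv -c 1000 -k 20 -w 8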