Natural Language Toolkit Cheat Sheet
Natural Language Toolkit (NLTK) is a suite of libraries and programs for symbolic and statistical natural language processing (NLP) for the Python programming language.
Loading
import nltk
from nltk.book import *
texts()
text1
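If the book corpora are not installed yet, the import above will fail; a one-time download (network connection required) fixes this:
nltk.download('book')  # fetches the corpora and data used by nltk.book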
Content
text1.concordance("monstrous")
text1.similar("monstrous")
text2.similar("monstrous")
text2.common_contexts(["monstrous", "very"])
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])  # takes a list of words
text3.generate()  # not available in NLTK 3.0; reinstated in later releases
Length and Sets
len(text3)
sorted(set(text3))
len(set(text3))
Lexical Diversity
from __future__ import division  # only needed in Python 2
len(text3) / len(set(text3)) # lexical richness
text3.count("smote")
100 * text4.count('a') / len(text4)
def lexical_diversity(text):
    return len(text) / len(set(text))

def percentage(word, text):
    return 100 * text.count(word) / len(text)
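A quick sanity check of the helpers (values are from the NLTK book corpora and may vary slightly by data version):
lexical_diversity(text3)  # ~16.05: each word in Genesis is used about 16 times on average
percentage('a', text4)    # ~1.46: 'a' makes up roughly 1.5% of the Inaugural Address corpus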
Concatenation
part1 = ['Far', 'out', 'in', 'the', 'uncharted', 'backwaters', 'of', 'the', 'unfashionable', 'end', 'of', 'the', 'western', 'spiral', 'arm', 'of', 'the', 'Galaxy']
part2 = ['lies', 'a', 'small', ',', 'unregarded', 'yellow', 'sun']
sent1 = part1 + part2
sent1.append('.')
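A quick check of the result (counts follow from the two parts above):
len(sent1)  # 26 = 18 + 7 words plus the appended '.'
sent1[-1]   # '.'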
Location
sent1[5]
sent1.index('Galaxy')
# slice
sent1[4:6]
# slice 0 to 5
sent1[:6]
# slice 6 to length
sent1[6:]
# last two words
text2[len(text2) - 2:]
# 'G' of 'Galaxy'
sent1[17][0]
# 'GGGGGG'
sent1[17][0] * 6
# 'Far Galaxy'
' '.join([sent1[0], sent1[17]])
# ['unregarded', 'yellow', 'sun', '.']
sent1[-4:]
Frequency Distribution
from nltk import FreqDist
fdist1 = FreqDist(text1)
# 50 most frequent words (keys() is not frequency-ordered in NLTK 3)
fdist1.most_common(50)
fdist1.plot(50, cumulative=True)
# words used once
fdist1.hapaxes()
Use set theory to find words longer than 15 characters
V = set(text1)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)
def words_of_length(length, text):
    V = set(text)
    long_words = [w for w in V if len(w) > length]
    return sorted(long_words)
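Calling the helper reproduces the inline example above:
words_of_length(15, text1)  # sorted list of words longer than 15 characters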
fdist5 = FreqDist(text5)
Words in text5 that are longer than 7 characters and appear more than 5 times
sorted([w for w in set(text5) if len(w) > 7 and fdist5[w] > 5])
Bigrams
from nltk import bigrams
# word pairs
b = bigrams(['more', 'is', 'said', 'than', 'done'])
list(b)
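Bigrams can be fed straight into FreqDist to rank word pairs (a minimal sketch; fdist_bigrams is just an illustrative name):
fdist_bigrams = FreqDist(bigrams(text1))
fdist_bigrams.most_common(5)  # five most frequent word pairs in Moby Dick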
# frequently occurring bigrams
text4.collocations()
Frequency Distribution of Word Lengths
fdist = FreqDist([len(w) for w in text1])
# keys are word lengths
fdist.keys()
# keys and values
fdist.items()
# most frequent
fdist.max()
# number of words of three characters
fdist[3]
# proportion of the text (a fraction, not a percentage)
fdist.freq(3)
fdist = FreqDist(text1)
# how many times does it occur
fdist['monstrous']
# frequency
fdist.freq('monstrous')
# number of samples
fdist.N()
fdist.keys()
for sample in sorted(fdist):
    print(str(fdist[sample]) + " - " + sample)
fdist.max()
fdist.tabulate()
fdist.plot()
fdist.plot(cumulative=True)
# test whether all samples in fdist1 occur less frequently than in fdist2
fdist1 < fdist2
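Operator support varies across NLTK and Python versions; comparing individual counts always works (fdist2 is built here for illustration):
fdist2 = FreqDist(text2)
fdist1['whale'] > fdist2['whale']  # True: 'whale' is far more common in Moby Dick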