Natural Language Toolkit (NLTK) is a suite of libraries and programs for symbolic and statistical natural language processing (NLP) written in the Python programming language.

Loading

import nltk
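
If the book texts are not already installed, the import below fails with a LookupError; downloading the "book" collection once fixes that (a minimal sketch, assuming the default NLTK data location):

nltk.download('book') # one-time download of the corpora and data used by nltk.book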

from nltk.book import *

texts()

text1

Content

text1.concordance("monstrous")

text1.similar("monstrous")

text2.similar("monstrous")

text2.common_contexts(["monstrous", "very"])

text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"]) # takes a list of words; requires matplotlib

text3.generate() # not available in NLTK 3.0; reinstated in later 3.x releases

Length and Sets

len(text3)

sorted(set(text3))

len(set(text3))

Lexical Diversity

from __future__ import division # only needed on Python 2; division already returns floats on Python 3
len(text3) / len(set(text3)) # lexical richness: average number of times each word is used

text3.count("smote")

100 * text4.count('a') / len(text4)

def lexical_diversity(text):
    return len(text) / len(set(text))

def percentage(word, text):
    return 100 * text.count(word) / len(text)
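
A quick usage sketch with the helpers above (text3 is Genesis and text4 the Inaugural Address corpus from nltk.book):

lexical_diversity(text3) # average number of times each distinct word is used
percentage('a', text4) # share of the corpus (in percent) made up of the token 'a'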


Concatenation

part1 = ['Far', 'out', 'in', 'the', 'uncharted', 'backwaters', 'of', 'the', 'unfashionable', 'end', 'of', 'the', 'western', 'spiral', 'arm', 'of', 'the', 'Galaxy']
part2 = ['lies', 'a', 'small', ',', 'unregarded', 'yellow', 'sun']

sent1 = part1 + part2

sent1.append('.')
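
A quick check on the result (the counts follow from the two lists above):

len(sent1) # 26 items: 18 from part1, 7 from part2, plus the appended '.'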

Location

sent1[5]

sent1.index('Galaxy')

# slice
sent1[4:6]

# slice 0 to 5
sent1[:6]

# slice 6 to length
sent1[6:]

# last two words
text2[len(text2) - 2:]

# 'G' of 'Galaxy'
sent1[17][0]

# 'GGGGGG'
sent1[17][0] * 6

# 'Far Galaxy'
' '.join([sent1[0], sent1[17]])

# ['unregarded', 'yellow', 'sun', '.']
sent1[-4:]

Frequency Distribution

fdist1 = FreqDist(text1)
vocabulary1 = list(fdist1.keys()) # keys() returns a view in Python 3, so convert to a list before slicing
vocabulary1[:50]
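
In NLTK 3, FreqDist is a Counter subclass, so the most frequent words and their counts can also be pulled directly with most_common:

fdist1.most_common(50) # 50 most frequent words with their counts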

fdist1.plot(50, cumulative=True)

# words used once
fdist1.hapaxes()

Use a set and a list comprehension to find words longer than 15 characters

V = set(text1)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)

def words_of_length(length, text):
    V = set(text)
    long_words = [w for w in V if len(w) > length]
    return sorted(long_words)
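
Usage of the helper above (words_of_length is defined in these notes, not an NLTK API):

words_of_length(15, text1) # same result as the long_words example above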

fdist5 = FreqDist(text5)

Words in text5 that are longer than 7 characters and appear more than 5 times

sorted([w for w in set(text5) if len(w) > 7 and fdist5[w] > 5])

Bigrams

from nltk import bigrams

# word pairs
b = bigrams(['more', 'is', 'said', 'than', 'done'])
list(b)
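
For reference, the pairs produced above:

# [('more', 'is'), ('is', 'said'), ('said', 'than'), ('than', 'done')]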

# frequently occurring bigrams
text4.collocations()

Frequency Distribution of Word Lengths

fdist = FreqDist([len(w) for w in text1])

# keys are word lengths
fdist.keys()

# keys and values
fdist.items()

# the most frequent word length
fdist.max()

# number of words of three characters
fdist[3]

# proportion of three-character words (a fraction, not a percentage)
fdist.freq(3)
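
freq is simply the sample count divided by the total number of samples:

fdist[3] / fdist.N() # the same value as fdist.freq(3)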

# back to a distribution over the words themselves
fdist = FreqDist(text1)

# how many times does it occur
fdist['monstrous']

# frequency
fdist.freq('monstrous')

# number of samples
fdist.N()

fdist.keys()

for sample in sorted(fdist):
    print(f"{fdist[sample]} - {sample}")

fdist.max()

fdist.tabulate()

fdist.plot()

fdist.plot(cumulative=True)
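
To run the comparison below, a second distribution is needed; here it is built over text2 (an arbitrary choice for illustration):

fdist2 = FreqDist(text2)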

# True if samples in fdist1 occur less frequently than in fdist2
fdist1 < fdist2