The Natural Language Toolkit (NLTK) is a suite of libraries and programs for symbolic and statistical natural language processing (NLP) in the Python programming language.
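The examples below assume NLTK is installed (for example via pip) and that the book corpora used by nltk.book have been downloaded once:
import nltk
nltk.download('book')  # one-time download of the corpora behind nltk.book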
Loading
import nltk
from nltk.book import *
texts()
text1
Content
text1.concordance("monstrous")
text1.similar("monstrous")
text2.similar("monstrous")
text2.common_contexts(["monstrous", "very"])
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])  # takes a list of words
text3.generate()  # not currently working in NLTK 3.0
Length and Sets
len(text3)
sorted(set(text3))
len(set(text3))
Lexical Diversity
from __future__ import division  # only needed on Python 2
len(text3) / len(set(text3))  # lexical richness
text3.count("smote")
100 * text4.count('a') / len(text4)

def lexical_diversity(text):
    return len(text) / len(set(text))

def percentage(word, text):
    return 100 * text.count(word) / len(text)
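Example calls for the helper functions above (any of the loaded texts works):
lexical_diversity(text3)
percentage('the', text4)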
Concatenation
part1 = ['Far', 'out', 'in', 'the', 'uncharted', 'backwaters', 'of', 'the', 'unfashionable', 'end', 'of', 'the', 'western', 'spiral', 'arm', 'of', 'the', 'Galaxy']
part2 = ['lies', 'a', 'small', ',', 'unregarded', 'yellow', 'sun']
sent1 = part1 + part2
sent1.append('.')
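A quick check of the combined list (using the two parts defined above):
len(sent1)  # 26 tokens, including the appended period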
Location
sent1[5]
sent1.index('Galaxy')
# slice
sent1[4:6]
# slice 0 to 5
sent1[:6]
# slice 6 to length
sent1[6:]
# last two words
text2[len(text2) - 2:]
# 'G' of 'Galaxy'
sent1[17][0]
# 'GGGGGG'
sent1[17][0] * 6
# 'Far Galaxy'
' '.join([sent1[0], sent1[17]])
# ['unregarded', 'yellow', 'sun', '.']
sent1[-4:]
Frequency Distribution
fdist1 = FreqDist(text1)
vocabulary1 = list(fdist1.keys())
vocabulary1[:50]  # in NLTK 3, keys() is not frequency-ordered; use fdist1.most_common(50) for the top 50
fdist1.plot(50, cumulative=True)
# words used once
fdist1.hapaxes()
Use sets to find words longer than 15 characters
V = set(text1)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)
def words_of_length(length, text):
    V = set(text)
    long_words = [w for w in V if len(w) > length]
    return sorted(long_words)
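For example, the call below reproduces the inline query above:
words_of_length(15, text1)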
fdist5 = FreqDist(text5)
Words in text5 that are longer than 7 characters and appear more than 5 times
sorted([w for w in set(text5) if len(w) > 7 and fdist5[w] > 5])
Bigrams
from nltk import bigrams
# word pairs
b = bigrams(['more', 'is', 'said', 'than', 'done'])
list(b)
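# [('more', 'is'), ('is', 'said'), ('said', 'than'), ('than', 'done')]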
# frequently occurring bigrams
text4.collocations()
Frequency Distribution Functions
fdist = FreqDist([len(w) for w in text1])
# keys are word lengths
fdist.keys()
# keys and values
fdist.items()
# most frequent word length
fdist.max()
# number of three-character words
fdist[3]
# proportion of the text (a fraction, not a percentage)
fdist.freq(3)
fdist = FreqDist(text1)
# how many times does 'monstrous' occur
fdist['monstrous']
# proportion of the text
fdist.freq('monstrous')
# total number of samples (tokens)
fdist.N()
fdist.keys()
for sample in sorted(fdist):
    print(str(fdist[sample]) + " - " + sample)
fdist.max()
fdist.tabulate()
fdist.plot()
fdist.plot(cumulative=True)
# test whether samples occur less frequently in fdist1 than in fdist2
fdist2 = FreqDist(text2)
fdist1 < fdist2