# NLTK book, chapter 2 — accessing text corpora and lexical resources.
# Interactive-session transcript: most bare expressions exist to display
# their value in a REPL and have no effect when run as a script.
import nltk
from nltk.book import *

# --- Gutenberg corpus ---
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
len(emma)
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")

from nltk.corpus import gutenberg
gutenberg.fileids()
emma = gutenberg.words('austen-emma.txt')

# Per-text statistics: average word length, sentence length, and
# lexical diversity (words per distinct vocabulary item).
for fileid in gutenberg.fileids():
    num_char = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(int(num_char / num_words), int(num_words / num_sents),
          int(num_words / num_vocab), fileid)

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')  # text split into sentences
macbeth_sentences[1037]
longest_len = max(len(s) for s in macbeth_sentences)  # length of the longest sentence
[s for s in macbeth_sentences if len(s) == longest_len]

# --- Web text and chat corpora ---
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

# --- Brown corpus ---
from nltk.corpus import brown
brown.categories()                  # the corpus' genre categories
brown.words(categories='news')      # restrict by category or by file
brown.words(fileids=['cg22'])
brown.sents(categories=['news', 'editorial', 'reviews'])

from nltk.corpus import brown
news_text = brown.words(categories='news')   # words of the 'news' genre
# Case-folded frequency distribution of the news words.
# FIX: was assigned to `sfdist` but read below as `fdist` (NameError).
fdist = FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m], end=' ')

# Modal-verb counts per genre via a conditional frequency distribution.
from nltk.corpus import brown
import nltk
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

# --- Reuters corpus ---
from nltk.corpus import reuters
reuters.fileids()     # the documents
reuters.categories()  # the Reuters topic categories
# Topics covered by one or more documents, and documents belonging to one
# or more categories — corpus methods accept a single fileid or a list.
# FIX: fileids were misspelled 'traing/...'; the corpus uses 'training/...'.
reuters.categories('training/9865')
reuters.categories(['training/9865', 'training/9880'])
reuters.fileids('barley')
reuters.fileids(['barley', 'corn'])
# Words and sentences can be requested per document or per category.
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories='barley')
reuters.words(categories=['barley', 'corn'])

# --- Inaugural address corpus ---
from nltk.corpus import inaugural
inaugural.fileids()
[fileid[:4] for fileid in inaugural.fileids()]  # the year prefix of each fileid
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

# --- Corpora in other languages ---
nltk.corpus.cess_esp.words()
nltk.corpus.floresta.words()
nltk.corpus.indian.words('hindi.pos')
nltk.corpus.udhr.fileids()
nltk.corpus.udhr.words('Javanese-Latin1')[11:]

from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)

# Three views of the same text: raw characters, words, sentences.
raw = gutenberg.raw("burgess-busterbrown.txt")
raw[1:20]
words = gutenberg.words("burgess-busterbrown.txt")
words[1:20]
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

# --- Loading your own corpus ---
# Reference: http://blog.csdn.net/shanyuelanhua/article/details/51212194
from nltk.corpus import BracketParseCorpusReader
# r"" prevents backslash escapes in the Windows path.
corpus_root = r"C:\Users\Tony\AppData\Roaming\nltk_data\SogouC.reduced\Reduced"
file_pattern = r".*/.*\.txt"  # match every .txt under every subdirectory of corpus_root
# Initialize the reader with the corpus directory and file pattern
# (files are assumed to be UTF-8 encoded by default).
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()  # if this lists files such as C000008/1001.txt, loading worked
# FIX: original used fullwidth quotes and stray spaces — a syntax error.
ptb.raw("C000008/1001.txt")  # shows content if the file's encoding matches the reader's

from nltk.corpus import PlaintextCorpusReader
corpus_root = r"C:\Users\Tony\AppData\Roaming\nltk_data\SogouC.reduced\Reduced"
file_pattern = r"1001\.txt"
wordlists = PlaintextCorpusReader(corpus_root, file_pattern)
wordlists.fileids()
wordlists.words("1001.txt")

# --- Conditional frequency distributions in detail ---
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
len(genre_word)
genre_word[:4]
genre_word[-4:]
cfd = nltk.ConditionalFreqDist(genre_word)
cfd
cfd.conditions()
cfd['news']
cfd['romance']
list(cfd['romance'])
cfd['romance']['could']

from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))

from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
# FIX: original read `udhr.words(lanng + '-Latin1')` — NameError typo for `lang`.
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.tabulate(conditions=['English', 'German_Deutsch'],
             samples=range(10), cumulative=True)

# --- Generating random text with bigrams ---
sent = ['In', 'the', 'beginning', 'God', 'created', 'the',
        'heaven', 'and', 'the', 'earth', '.']
nltk.bigrams(sent)


def generate_model(cfdist, word, num=15):
    """Greedily emit `num` words, always following `word` with its
    most likely successor according to `cfdist`."""
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()


text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)
print(cfd['living'])
generate_model(cfd, 'living')

# --- Using your own module ---
import sys
sys.path.append(r'C:\Users\Tony\Documents\Workspace\Python\NLP with Python\Chapter 2')
# FIX: original read `form textproc import *` — syntax-error typo for `from`.
from textproc import *
plural("fairy")
plural("woman")


# textproc.py
def plural(word):
    """Return a naive English plural of `word` (y→ies, s/x/sh/ch→es, an→en)."""
    if word.endswith('y'):
        return word[:-1] + 'ies'
    elif word[-1] in 'sx' or word[-2:] in ['sh', 'ch']:
        return word + 'es'
    elif word.endswith('an'):
        return word[:-2] + 'en'
    else:
        return word + 's'


def unusual_words(text):
    """Vocabulary of `text` minus the standard English wordlist — what
    remains is rare words and misspellings."""
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)


unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))
unusual_words(nltk.corpus.nps_chat.words())

from nltk.corpus import stopwords
stopwords.words('english')


def content_fraction(text):
    """Fraction of the words in `text` that are not stopwords."""
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content) / len(text)


content_fraction(nltk.corpus.reuters.words())

# --- Word puzzle ---
puzzle_letters = nltk.FreqDist('egivrvonl')
obligatory = 'r'
wordlist = nltk.corpus.words.words()
# FreqDist comparison checks that each letter of the candidate occurs no
# more often than it does in the puzzle letters.
[w for w in wordlist
 if len(w) >= 6 and obligatory in w and nltk.FreqDist(w) <= puzzle_letters]

# --- Names corpus ---
# Names appearing in both the male and female files.
names = nltk.corpus.names
names.fileids()
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]
# Final letters of male vs. female names.
cfd = nltk.ConditionalFreqDist(
    (fileids, name[-1])
    for fileids in names.fileids()
    for name in names.words(fileids))
cfd.plot()

# --- CMU pronouncing dictionary ---
entries = nltk.corpus.cmudict.entries()
len(entries)
for entry in entries[39943:39951]:
    print(entry)

# Entries whose pronunciation has exactly three phones, P...T.
for word, pron in entries:
    if len(pron) == 3:
        ph1, ph2, ph3 = pron
        if ph1 == 'P' and ph3 == 'T':
            print(word, ph2, end=' ')

# FIX: ARPAbet phone was mistyped 'IHO' (letter O) — it is 'IH0' (zero).
syllable = ['N', 'IH0', 'K', 'S']
[word for word, pron in entries if pron[-4:] == syllable]
# Words spelled with final 'n' but pronounced with final 'M'.
[w for w, pron in entries if pron[-1] == 'M' and w[-1] == 'n']
# Initial letters of words pronounced with initial 'N' but not spelled with 'n'.
sorted(set(w[:2] for w, pron in entries if pron[0] == 'N' and w[0] != 'n'))


# Primary stress (1), secondary stress (2), no stress (0).
def stress(pron):
    """Extract the stress digits from a CMU pronunciation."""
    return [char for phone in pron for char in phone if char.isdigit()]


# Words matching specific stress patterns.
[w for w, pron in entries if stress(pron) == ['0', '1', '0', '2', '0']]
[w for w, pron in entries if stress(pron) == ['0', '2', '0', '1', '0']]

# Minimally-contrasting sets via a conditional frequency distribution:
# group three-phone P-initial words by their first and last phone.
p3 = [(pron[0] + '-' + pron[2], word)
      for (word, pron) in entries
      if pron[0] == 'P' and len(pron) == 3]
cfd = nltk.ConditionalFreqDist(p3)
for template in cfd.conditions():
    if len(cfd[template]) > 10:
        words = cfd[template].keys()
        wordlist = ' '.join(words)
        print(template, wordlist[:70] + "...")

# Dictionary-style lookup by word.
prondict = nltk.corpus.cmudict.dict()
prondict['fire']   # index the dict with the word as key
prondict['blog']   # a missing key raises KeyError
# FIX: phone was mistyped 'AAl' (lowercase L) — it is 'AA1' (one).
prondict['blog'] = [['B', 'L', 'AA1', 'G']]
prondict['blog']

# Text to pronunciation.
text = ['natural', 'language', 'processing']
[ph for w in text for ph in prondict[w][0]]

# --- Comparative wordlists (Swadesh) ---
from nltk.corpus import swadesh
swadesh.fileids()
swadesh.words('en')
# entries() with a list of languages gives cognate pairs.
fr2en = swadesh.entries(['fr', 'en'])
fr2en
translate = dict(fr2en)
translate['chien']
translate['jeter']
# Convert German-English and Spanish-English pairs to dicts and merge
# them into the existing translate mapping.
de2en = swadesh.entries(['de', 'en'])  # German -> English
es2en = swadesh.entries(['es', 'en'])  # Spanish -> English
translate.update(dict(de2en))
translate.update(dict(es2en))
translate['Hund']
translate['perro']
# Compare Germanic and Romance languages.
languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])

# --- Lexicon tools: Toolbox / Shoebox ---
from nltk.corpus import toolbox
toolbox.entries('rotokas.dic')

# --- WordNet: senses and synonyms ---
from nltk.corpus import wordnet as wn
wn.synsets('motorcar')                          # synsets containing the word
wn.synset('car.n.01').lemma_names()             # members of a synset
wn.synset('car.n.01').definition()              # its gloss
wn.synset('car.n.01').examples()                # example sentences
wn.synset('car.n.01').lemmas()                  # all lemmas of the synset
wn.lemma('car.n.01.automobile')                 # look up one lemma
# FIX: synset/name are methods in NLTK 3 (consistent with the
# lemma_names()/definition() calls above); originals were attribute reads.
wn.lemma('car.n.01.automobile').synset()        # the lemma's synset
wn.lemma('car.n.01.automobile').name()          # the lemma's name
wn.synsets('car')                               # 'car' has 5 synsets
for synset in wn.synsets('car'):
    print(synset.lemma_names())
    print(synset.definition())
wn.synsets('dish')
for synset in wn.synsets('dish'):
    print(synset.definition())

# Hyponyms.
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
types_of_motorcar[26]
sorted([lemma.name()
        for synset in types_of_motorcar
        for lemma in synset.lemmas()])

# Navigate the hierarchy upward via hypernyms.
motorcar.hypernyms()
paths = motorcar.hypernym_paths()
len(paths)
[synset.name() for synset in paths[0]]
[synset.name() for synset in paths[1]]
# The most general (root) hypernym synsets.
motorcar.root_hypernyms()

# Graphical WordNet browser.
nltk.app.wordnet()

# More lexical relations.
wn.synset('tree.n.01').part_meronyms()       # parts of the item
wn.synset('tree.n.01').substance_meronyms()  # what the item is made of
wn.synset('tree.n.01').member_holonyms()     # collections the item belongs to
for synset in wn.synsets('mint', wn.NOUN):
    print(synset.name() + ':', synset.definition())
wn.synset('mint.n.04').part_holonyms()       # mint.n.04 is part of mint.n.02
wn.synset('mint.n.04').substance_holonyms()  # and the substance of mint.n.05
wn.synset('walk.v.01').entailments()         # walking entails stepping
wn.synset('eat.v.01').entailments()
wn.synset('tease.v.03').entailments()

# Antonyms.
wn.lemma('supply.n.02.supply').antonyms()
wn.lemma('rush.v.01.rush').antonyms()
wn.lemma('horizontal.a.01.horizontal').antonyms()
wn.lemma('staccato.r.01.staccato').antonyms()

# --- Semantic similarity ---
right = wn.synset('right_whale.n.01')
orca = wn.synset('orca.n.01')
minke = wn.synset('minke_whale.n.01')
tortoise = wn.synset('tortoise.n.01')
novel = wn.synset('novel.n.01')
right.lowest_common_hypernyms(minke)
right.lowest_common_hypernyms(orca)
right.lowest_common_hypernyms(tortoise)
right.lowest_common_hypernyms(novel)
# Quantify generality via each synset's depth in the hierarchy.
wn.synset('baleen_whale.n.01').min_depth()
wn.synset('whale.n.02').min_depth()
wn.synset('vertebrate.n.01').min_depth()
wn.synset('entity.n.01').min_depth()
# Similarity in [0, 1] from the shortest path in the hypernym hierarchy.
right.path_similarity(minke)
right.path_similarity(orca)
right.path_similarity(tortoise)
right.path_similarity(novel)

# Help.
help(wn)

# VerbNet.
nltk.corpus.verbnet