Natural Language Processing with Python — Chapter 2 Study Notes
Published: 2019-06-27


Worked examples from Chapter 2: accessing text corpora (Gutenberg, web text, Brown, Reuters, inaugural addresses, UDHR), loading your own corpus, conditional frequency distributions, lexical resources (stopwords, names, the CMU Pronouncing Dictionary, Swadesh wordlists, Toolbox) and WordNet.

# Gutenberg corpus
import nltk
from nltk.book import *
from nltk.corpus import gutenberg

nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
len(emma)
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")

gutenberg.fileids()
emma = gutenberg.words('austen-emma.txt')

# Average word length, average sentence length and lexical diversity per file
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(int(num_chars / num_words), int(num_words / num_sents),
          int(num_words / num_vocab), fileid)

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')  # split the text into sentences
macbeth_sentences[1037]
longest_len = max(len(s) for s in macbeth_sentences)  # length of the longest sentence
[s for s in macbeth_sentences if len(s) == longest_len]

# Web text and chat corpora
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

# Brown corpus
from nltk.corpus import brown
brown.categories()               # the categories of the corpus
brown.words(categories='news')   # read a particular category or file
brown.words(fileids=['cg22'])
brown.sents(categories=['news', 'editorial', 'reviews'])

news_text = brown.words(categories='news')            # words in the news genre
fdist = nltk.FreqDist(w.lower() for w in news_text)   # frequency distribution, ignoring case
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m], end=' ')

# Counts of modal verbs across genres
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

# Reuters corpus
from nltk.corpus import reuters
reuters.fileids()      # training and test documents
reuters.categories()   # categories of the Reuters corpus
# Look up the topics covered by one or more documents, or the documents contained in one
# or more categories; the corpus methods accept either a single fileid or a list of fileids.
reuters.categories('training/9865')
reuters.categories(['training/9865', 'training/9880'])
reuters.fileids('barley')
reuters.fileids(['barley', 'corn'])
# Words and sentences can also be requested per document or per category
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories='barley')
reuters.words(categories=['barley', 'corn'])

# Inaugural Address corpus
from nltk.corpus import inaugural
inaugural.fileids()
[fileid[:4] for fileid in inaugural.fileids()]
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

# Corpora in other languages
nltk.corpus.cess_esp.words()
nltk.corpus.floresta.words()
nltk.corpus.indian.words('hindi.pos')
nltk.corpus.udhr.fileids()
nltk.corpus.udhr.words('Javanese-Latin1')[11:]

from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)

# Raw text, words and sentences of a single file
raw = gutenberg.raw("burgess-busterbrown.txt")
raw[1:20]
words = gutenberg.words("burgess-busterbrown.txt")
words[1:20]
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

# Loading your own corpus (cf. http://blog.csdn.net/shanyuelanhua/article/details/51212194)
from nltk.corpus import BracketParseCorpusReader
corpus_root = r"C:\Users\Tony\AppData\Roaming\nltk_data\SogouC.reduced\Reduced"  # raw string avoids escaping the backslashes
file_pattern = r".*/.*\.txt"  # match the txt files in every subdirectory of corpus_root
# Initialise the reader with the corpus directory and the file pattern; the default encoding is UTF-8
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()  # if this lists files such as C000008/1001.txt, the reader is set up correctly
ptb.raw("C000008/1001.txt")  # the content is readable only if the file's encoding matches the reader's

from nltk.corpus import PlaintextCorpusReader
corpus_root = r"C:\Users\Tony\AppData\Roaming\nltk_data\SogouC.reduced\Reduced"
file_pattern = r"1001\.txt"
wordlists = PlaintextCorpusReader(corpus_root, file_pattern)
wordlists.fileids()
wordlists.words("1001.txt")
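The SogouC news files are commonly GBK/GB2312-encoded, so the readers above may raise decode errors under their default UTF-8 encoding. A minimal sketch of passing an explicit encoding to PlaintextCorpusReader (the path and the 'gbk' encoding are assumptions about the local setup, not something stated in the original notes):

from nltk.corpus import PlaintextCorpusReader

# Assumed local path to the Sogou corpus; adjust to your own installation.
corpus_root = r"C:\Users\Tony\AppData\Roaming\nltk_data\SogouC.reduced\Reduced"
# Pass an explicit encoding so the reader decodes the GBK files correctly.
sogou = PlaintextCorpusReader(corpus_root, r".*/.*\.txt", encoding='gbk')
sogou.fileids()[:5]                  # fileids such as 'C000008/1001.txt'
sogou.raw(sogou.fileids()[0])[:100]  # first 100 characters of the first document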
# Conditional frequency distributions
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))

genre_word = [
    (genre, word)
    for genre in ['news', 'romance']
    for word in brown.words(categories=genre)]
len(genre_word)
genre_word[:4]
genre_word[-4:]

cfd = nltk.ConditionalFreqDist(genre_word)
cfd
cfd.conditions()
cfd['news']
cfd['romance']
list(cfd['romance'])
cfd['romance']['could']

from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))

from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.tabulate(
    conditions=['English', 'German_Deutsch'],
    samples=range(10),
    cumulative=True)

# Generating random text with bigrams
sent = ['In', 'the', 'beginning', 'God', 'created',
        'the', 'heaven', 'and', 'the', 'earth', '.']
list(nltk.bigrams(sent))

def generate_model(cfdist, word, num=15):
    # Repeatedly emit the current word and move to its most likely successor
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()

text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)
print(cfd['living'])
generate_model(cfd, 'living')
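Because generate_model() always follows the single most likely successor, the generated text tends to get stuck repeating the same high-frequency words. As a sketch of one possible variation (not part of the original notes; the function name is mine), the next word can instead be sampled in proportion to its bigram count:

import random

def generate_model_sampled(cfdist, word, num=15):
    # Sample each successor with probability proportional to its bigram count,
    # instead of always taking cfdist[word].max().
    for _ in range(num):
        print(word, end=' ')
        successors = list(cfdist[word].keys())
        if not successors:       # dead end: the word never occurs as a left context
            break
        weights = [cfdist[word][w] for w in successors]
        word = random.choices(successors, weights=weights)[0]

generate_model_sampled(cfd, 'living')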
# Using a module
import sys
sys.path.append(
    r'C:\Users\Tony\Documents\Workspace\Python\NLP with Python\Chapter 2')
from textproc import *
plural("fairy")
plural("woman")

# textproc.py
def plural(word):
    if word.endswith('y'):
        return word[:-1] + 'ies'
    elif word[-1] in 'sx' or word[-2:] in ['sh', 'ch']:
        return word + 'es'
    elif word.endswith('an'):
        return word[:-2] + 'en'
    else:
        return word + 's'

# Compute the vocabulary of a text, remove everything that appears in a standard
# wordlist, and keep only the rare or misspelled words
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)

unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))
unusual_words(nltk.corpus.nps_chat.words())

from nltk.corpus import stopwords
stopwords.words('english')

# Fraction of the words in a text that are not in the stopword list
def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content) / len(text)

content_fraction(nltk.corpus.reuters.words())

# Word puzzle
puzzle_letters = nltk.FreqDist('egivrvonl')
obligatory = 'r'
wordlist = nltk.corpus.words.words()
# The FreqDist comparison checks that each letter of the candidate occurs no more
# often than it does in the puzzle
[w for w in wordlist if len(w) >= 6 and obligatory in w
    and nltk.FreqDist(w) <= puzzle_letters]

# Names that appear in both the male and the female file
names = nltk.corpus.names
names.fileids()
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]

# Final letters of male and female names
cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.plot()

# The CMU Pronouncing Dictionary
entries = nltk.corpus.cmudict.entries()
len(entries)
for entry in entries[39943:39951]:
    print(entry)

# Scan the dictionary for entries whose pronunciation consists of three phones
for word, pron in entries:
    if len(pron) == 3:
        ph1, ph2, ph3 = pron
        if ph1 == 'P' and ph3 == 'T':
            print(word, ph2, end=' ')

# Words whose pronunciation ends with this four-phone sequence
syllable = ['N', 'IH0', 'K', 'S']
[word for word, pron in entries if pron[-4:] == syllable]

# Words spelled with a final 'n' whose final sound is 'M'
[w for w, pron in entries if pron[-1] == 'M' and w[-1] == 'n']
# Initial letters of words that begin with the sound 'N' but are not spelled with an initial 'n'
sorted(set(w[:2] for w, pron in entries if pron[0] == 'N' and w[0] != 'n'))

# Primary stress (1), secondary stress (2), no stress (0)
# Extract the stress digits from a pronunciation
def stress(pron):
    return [char for phone in pron for char in phone if char.isdigit()]

# Scan the dictionary for words with a particular stress pattern
[w for w, pron in entries if stress(pron) == ['0', '1', '0', '2', '0']]
[w for w, pron in entries if stress(pron) == ['0', '2', '0', '1', '0']]

# Use a conditional frequency distribution to find minimally-contrasting sets of words
p3 = [(pron[0] + '-' + pron[2], word)  # group three-phone words by their first and last phones
      for (word, pron) in entries
      if pron[0] == 'P' and len(pron) == 3]  # all three-phone words beginning with 'P'
cfd = nltk.ConditionalFreqDist(p3)
for template in cfd.conditions():
    if len(cfd[template]) > 10:
        words = cfd[template].keys()
        wordlist = ' '.join(words)
        print(template, wordlist[:70] + "...")

# Look up the dictionary via a particular word
prondict = nltk.corpus.cmudict.dict()
prondict['fire']   # index the dictionary with a key in square brackets
prondict['blog']   # looking up a missing key raises a KeyError
prondict['blog'] = [['B', 'L', 'AA1', 'G']]
prondict['blog']

# From text to pronunciation
text = ['natural', 'language', 'processing']
[ph for w in text for ph in prondict[w][0]]

# Comparative wordlists (Swadesh)
from nltk.corpus import swadesh
swadesh.fileids()
swadesh.words('en')
# entries() takes a list of languages and returns the cognate words across them
fr2en = swadesh.entries(['fr', 'en'])
fr2en
translate = dict(fr2en)
translate['chien']
translate['jeter']
# Turn the German-English and Spanish-English pairs into dictionaries with dict(),
# then update the original translate dictionary with these extra mappings
de2en = swadesh.entries(['de', 'en'])  # German -> English
es2en = swadesh.entries(['es', 'en'])  # Spanish -> English
translate.update(dict(de2en))
translate.update(dict(es2en))
translate['Hund']
translate['perro']

# Compare Germanic and Romance languages
languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])

# Lexical tools: Toolbox and Shoebox
from nltk.corpus import toolbox
toolbox.entries('rotokas.dic')
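toolbox.entries() returns (headword, feature-list) pairs, where the feature list holds (field-marker, value) tuples. A small sketch of unpacking the first Rotokas entry (the exact field markers depend on the dictionary file):

from nltk.corpus import toolbox

lexicon = toolbox.entries('rotokas.dic')
headword, features = lexicon[0]   # e.g. the headword 'kaa' with its fields
headword
features[:5]                      # (marker, value) pairs such as part of speech and gloss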
# WordNet: senses and synonyms
from nltk.corpus import wordnet as wn
wn.synsets('motorcar')                     # synsets containing 'motorcar'
wn.synset('car.n.01').lemma_names()        # lemma names of the synset
wn.synset('car.n.01').definition()         # definition of the synset
wn.synset('car.n.01').examples()           # example sentences
wn.synset('car.n.01').lemmas()             # all lemmas of the synset
wn.lemma('car.n.01.automobile')            # look up a particular lemma
wn.lemma('car.n.01.automobile').synset()   # the synset a lemma belongs to
wn.lemma('car.n.01.automobile').name()     # the name of a lemma

wn.synsets('car')   # 'car' has five synsets
for synset in wn.synsets('car'):
    print(synset.lemma_names())
    print(synset.definition())

wn.synsets('dish')
for synset in wn.synsets('dish'):
    print(synset.definition())

# Hyponyms
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
types_of_motorcar[26]
sorted(lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas())

# Navigate the hierarchy upwards via hypernyms
motorcar.hypernyms()
paths = motorcar.hypernym_paths()
len(paths)
[synset.name() for synset in paths[0]]
[synset.name() for synset in paths[1]]
# The most general (root) hypernym synset
motorcar.root_hypernyms()

# Graphical WordNet browser
nltk.app.wordnet()

# More lexical relations
wn.synset('tree.n.01').part_meronyms()       # parts of a tree
wn.synset('tree.n.01').substance_meronyms()  # what a tree is made of
wn.synset('tree.n.01').member_holonyms()     # collections a tree belongs to
for synset in wn.synsets('mint', wn.NOUN):
    print(synset.name() + ':', synset.definition())
wn.synset('mint.n.04').part_holonyms()       # mint.n.04 is part of mint.n.02
wn.synset('mint.n.04').substance_holonyms()  # and the substance that makes up mint.n.05
wn.synset('walk.v.01').entailments()         # walking entails stepping
wn.synset('eat.v.01').entailments()
wn.synset('tease.v.03').entailments()

# Antonyms
wn.lemma('supply.n.02.supply').antonyms()
wn.lemma('rush.v.01.rush').antonyms()
wn.lemma('horizontal.a.01.horizontal').antonyms()
wn.lemma('staccato.r.01.staccato').antonyms()

# Semantic similarity
right = wn.synset('right_whale.n.01')
orca = wn.synset('orca.n.01')
minke = wn.synset('minke_whale.n.01')
tortoise = wn.synset('tortoise.n.01')
novel = wn.synset('novel.n.01')
right.lowest_common_hypernyms(minke)
right.lowest_common_hypernyms(orca)
right.lowest_common_hypernyms(tortoise)
right.lowest_common_hypernyms(novel)

# Quantify how general a concept is by looking up the depth of each synset
wn.synset('baleen_whale.n.01').min_depth()
wn.synset('whale.n.02').min_depth()
wn.synset('vertebrate.n.01').min_depth()
wn.synset('entity.n.01').min_depth()

# Path similarity in the range 0-1, based on the shortest path connecting the
# concepts in the hypernym hierarchy
right.path_similarity(minke)
right.path_similarity(orca)
right.path_similarity(tortoise)
right.path_similarity(novel)

# Help
help(wn)

# VerbNet
nltk.corpus.verbnet
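The notes end with just the VerbNet corpus object. A minimal sketch of querying it with the reader's lemmas(), classids(), vnclass() and pprint() helpers; the lemma 'give' and the class identifiers it returns are assumptions that depend on the installed VerbNet version:

from nltk.corpus import verbnet

verbnet.lemmas()[:10]        # verbs covered by VerbNet
verbnet.classids('give')     # VerbNet class ids that contain the lemma 'give'
vn_class = verbnet.vnclass(verbnet.classids('give')[0])  # XML element for that class
print(verbnet.pprint(vn_class))  # readable summary: members, thematic roles, frames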

 

Reposted from: https://www.cnblogs.com/yezuoxian/p/6709051.html
