安裝NLTK
pip install nltk
下載NLTK資料包(語料庫與模型)
import nltk
nltk.download()
# Opens the downloader GUI; download the data you need from there.
載入內建的書籍文字檔
# Nine books in total (if a book cannot be found, follow the on-screen prompt to download it).
from nltk.book import *
計算單字頻率並繪圖
from bs4 import BeautifulSoup
import urllib.request
import nltk

# Fetch the page; the context manager closes the HTTP connection afterwards.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# Strip the markup and keep only the visible text.
soup = BeautifulSoup(html, "html5lib")
text = soup.get_text(strip=True)

# Whitespace-split the text into tokens and count how often each one occurs.
tokens = text.split()
freq = nltk.FreqDist(tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))

# Plot the 20 most frequent tokens (plain counts, not cumulative).
freq.plot(20, cumulative=False)
移除停用詞(Stop Words)
停用詞大致分為兩類。
1)人類語言中包含的功能詞,如'the'、'is'、'at'、'which'、'on'等。
2)詞彙詞,比如'want'等,這些詞應用十分廣泛,但是對這樣的詞搜尋引擎無法保證能夠給出真正相關的搜索結果。
#stopwords必須使用nltk.download()下載
from bs4 import BeautifulSoup
import urllib.request
import nltk
from nltk.corpus import stopwords

# Fetch the page; the context manager closes the HTTP connection afterwards.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# Strip the markup and keep only the visible text.
soup = BeautifulSoup(html, "html5lib")
text = soup.get_text(strip=True)
tokens = text.split()

# Load the English stop-word list once; a set makes each membership test cheap.
sr = stopwords.words('english')
stop_set = set(sr)

# Keep every token that is not a stop word, preserving the original order.
clean_tokens = [token for token in tokens if token not in stop_set]

freq = nltk.FreqDist(clean_tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))

# Plot the 20 most frequent remaining tokens (plain counts, not cumulative).
freq.plot(20, cumulative=False)

# Try analysing the novel "1984" to see which words are the most frequent;
# the text is available at http://gutenberg.net.au/ebooks01/0100021.txt
分開英文句子(斷句)
from nltk.tokenize import sent_tokenize

# Split a paragraph of English text into sentences.
mytext = "Hello Adam, how are you? I hope everything is going well. Today is a good day, see you dude."
print(sent_tokenize(mytext))
# Output:
# ['Hello Adam, how are you?', 'I hope everything is going well.', 'Today is a good day, see you dude.']
分開英文單字(斷詞)
from nltk.tokenize import word_tokenize

# Split English text into word and punctuation tokens; note that "Mr." stays
# a single token in the output below.
mytext = "Hello Mr. Adam, how are you? I hope everything is going well. Today is a good day, see you dude."
print(word_tokenize(mytext))
# Output:
# ['Hello', 'Mr.', 'Adam', ',', 'how', 'are', 'you', '?', 'I', 'hope', 'everything', 'is', 'going', 'well', '.', 'Today', 'is', 'a', 'good', 'day', ',', 'see', 'you', 'dude', '.']
分開非英文文字
from nltk.tokenize import sent_tokenize

# Split non-English text into sentences by naming the language explicitly.
mytext = "Bonjour M. Adam, comment allez-vous? J'espère que tout va bien. Aujourd'hui est un bon jour."
print(sent_tokenize(mytext, "french"))
# Output:
# ['Bonjour M. Adam, comment allez-vous?', "J'espère que tout va bien.", "Aujourd'hui est un bon jour."]
使用WordNet顯示同義/反義詞
# The wordnet corpus must be downloaded with nltk.download() first.
from nltk.corpus import wordnet

# Print the definition and usage examples of the first synset for "pain".
syn = wordnet.synsets("pain")
print(syn[0].definition())
print(syn[0].examples())

# Use a for loop to collect many related synonyms.
from nltk.corpus import wordnet

synonyms = []
# A separate loop name keeps the `syn` list from the lookup above intact.
for synset in wordnet.synsets('Computer'):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
print(synonyms)

# Use a for loop to collect many antonyms.
from nltk.corpus import wordnet

# Collect the first antonym of every lemma of "small" that has one.
antonyms = []
for synset in wordnet.synsets("small"):
    for lemma in synset.lemmas():
        # Look the antonyms up once and reuse the result.
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())
print(antonyms)