Python自然語言處理(一):Topic Modeling主題模型(LDA)
1 min read · Dec 13, 2019
# Import modules, load the data, and take a first look at it.
import pandas as pd

reviews_data = pd.read_csv('Hotel_Reviews.csv')
# Printed explicitly so the preview is visible when run as a plain script,
# not only as the last expression of a notebook cell.
print(reviews_data.head())

# Convert the text into a document-term matrix with scikit-learn.
# max_df=0.9: ignore terms that appear in more than 90% of the documents
#             (i.e. drop very high-frequency vocabulary).
# min_df=2:   a term must appear in at least 2 documents to be kept.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')

# Drop rows whose review text is missing; CountVectorizer cannot tokenize NaN.
# Only the column that is actually vectorized is checked, so rows are not
# discarded because of missing values in unrelated columns.
df = reviews_data.dropna(subset=['Positive_Review'])
dtm = cv.fit_transform(df['Positive_Review'])

# Fit the LDA topic model.
# n_components => number of topics to split the corpus into
# random_state => fixed to 42 so results are reproducible
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(dtm)

# Inspect the result: the 10 highest-weighted words of every topic.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on old versions that do not have it yet.
try:
    feature_names = cv.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0
    feature_names = cv.get_feature_names()

for i, topic in enumerate(LDA.components_):
    print(f"TOP 10 WORDS PER TOPIC #{i}")
    # argsort() is ascending, so the last 10 indices are the top 10 words
    # (listed from lower to higher weight).
    print([feature_names[index] for index in topic.argsort()[-10:]])