20 newsgroupsデータの紹介とテキスト分類の例
3774 ワード
概要
20 newsgroups データセットは約18000件のニュースグループ投稿から成り、全部で20種類のトピックに及ぶため、20 newsgroups text dataset と呼ばれます。
データは訓練セットとテストセットの2つの部分に分かれており、通常はテキスト分類に用いられます。
基本的な使い方
sklearnはこのデータセットを取得するためのインタフェース sklearn.datasets.fetch_20newsgroups を提供しています。
以下では、sklearnのドキュメントに沿ってこのデータセットの使い方を説明します。
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
# Load the training split of the dataset.
newsgroups_train = fetch_20newsgroups(subset='train')
# The category names live in the `target_names` attribute
# (the original `targernames` was a typo and raised AttributeError).
pprint(list(newsgroups_train.target_names))
全部で20種類のカテゴリがあります:
['alt.atheism',
'comp.graphics',
'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware',
'comp.sys.mac.hardware',
'comp.windows.x',
'misc.forsale',
'rec.autos',
'rec.motorcycles',
'rec.sport.baseball',
'rec.sport.hockey',
'sci.crypt',
'sci.electronics',
'sci.med',
'sci.space',
'soc.religion.christian',
'talk.politics.guns',
'talk.politics.mideast',
'talk.politics.misc',
'talk.religion.misc']
データ newsgroups_train のいくつかの属性を見てみましょう:
print(newsgroups_train.filenames.shape) # (11314,)
# Inspect the label array and the raw documents; the trailing comments
# show the expected output of each call.
print(newsgroups_train.target.shape) # (11314,)
print(newsgroups_train.target[:10]) # [ 7 4 4 1 14 16 13 3 2 4]
print(newsgroups_train['data'][:2]) # ["From: [email protected] (where's my thin...
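fetch_20newsgroups のパラメータ設定:
fetch_20newsgroups(data_home=None, # download/cache directory for the dataset (None = default location)
subset='train', # which split to load: 'train', 'test' or 'all'
categories=None, # list of category names to load; None loads all 20
shuffle=True, # whether to shuffle the samples
random_state=42, # seed used for the shuffling
remove=(), # parts to strip from each post: any of ('headers','footers','quotes')
download_if_missing=True # if False, raise an error instead of downloading when the data is not available locally
)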
テキストをTF-IDFベクトルに変換
from sklearn.feature_extraction.text import TfidfVectorizer
# Restrict the experiment to four of the twenty categories.
categories = ['alt.atheism', 'talk.religion.misc','comp.graphics', 'sci.space']
# Load only the training split for the selected categories.
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories)
# Turn the raw documents into a sparse TF-IDF document-term matrix.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
# Shape is (number of documents, number of features).
print(vectors.shape)
# Average number of non-zero entries per document (row).
print(vectors.nnz / float(vectors.shape[0]))
#
(2034, 34118)
159.0132743362832
出力から、抽出されたTF-IDFベクトルは非常に疎であることが分かります。30000次元を超える特徴量のうち、1文書あたりの非ゼロ要素は平均で約159個しかありません。
ナイーブベイズによる分類
from sklearn.feature_extraction.text import TfidfVectorizer
# Restrict the experiment to four of the twenty categories.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# Load only the training split for the selected categories.
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
# Fit the TF-IDF vocabulary and weights on the training documents only.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
print(vectors.shape)
print(vectors.nnz / float(vectors.shape[0]))
# Multinomial naive Bayes classifier and the evaluation metrics.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
# Load the held-out test split for the same categories.
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
# Reuse the fitted vectorizer (transform, not fit_transform) so the test
# matrix shares the feature space learned from the training data.
vectors_test = vectorizer.transform(newsgroups_test.data)
# Train the classifier on the training vectors.
clf = MultinomialNB(alpha=0.1)
clf.fit(vectors, newsgroups_train.target)
# Predict on the test set and report the scores. The labels make the
# printed lines match the sample output shown below this snippet.
pred = clf.predict(vectors_test)
print('f1_score:', f1_score(newsgroups_test.target, pred, average='macro'))
print('accuracy:', accuracy_score(newsgroups_test.target, pred))
#
f1_score: 0.8823530044163621
accuracy: 0.8965262379896526
リファレンス
データセットのアドレス:http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
sklearn による 20 newsgroups の紹介:http://scikit-learn.org/stable/datasets/twenty_newsgroups.html
sklearnはこのデータセットを取得するためのインタフェース sklearn.datasets.fetch_20newsgroups を提供しています。
以下では、sklearnのドキュメントに沿ってこのデータセットの使い方を説明します。
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
# Load the training split of the dataset.
newsgroups_train = fetch_20newsgroups(subset='train')
# The category names live in the `target_names` attribute
# (the original `targernames` was a typo and raised AttributeError).
pprint(list(newsgroups_train.target_names))
全部で20種類あります
['alt.atheism',
'comp.graphics',
'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware',
'comp.sys.mac.hardware',
'comp.windows.x',
'misc.forsale',
'rec.autos',
'rec.motorcycles',
'rec.sport.baseball',
'rec.sport.hockey',
'sci.crypt',
'sci.electronics',
'sci.med',
'sci.space',
'soc.religion.christian',
'talk.politics.guns',
'talk.politics.mideast',
'talk.politics.misc',
'talk.religion.misc']
データ newsgroups_train のいくつかの属性を見てみましょう:
print(newsgroups_train.filenames.shape) # (11314,)
# Inspect the label array and the raw documents; the trailing comments
# show the expected output of each call.
print(newsgroups_train.target.shape) # (11314,)
print(newsgroups_train.target[:10]) # [ 7 4 4 1 14 16 13 3 2 4]
print(newsgroups_train['data'][:2]) # ["From: [email protected] (where's my thin...
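fetch_20newsgroups のパラメータ設定:
fetch_20newsgroups(data_home=None, # download/cache directory for the dataset (None = default location)
subset='train', # which split to load: 'train', 'test' or 'all'
categories=None, # list of category names to load; None loads all 20
shuffle=True, # whether to shuffle the samples
random_state=42, # seed used for the shuffling
remove=(), # parts to strip from each post: any of ('headers','footers','quotes')
download_if_missing=True # if False, raise an error instead of downloading when the data is not available locally
)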
テキストをTF-IDFベクトルに変換
from sklearn.feature_extraction.text import TfidfVectorizer
# Restrict the experiment to four of the twenty categories.
categories = ['alt.atheism', 'talk.religion.misc','comp.graphics', 'sci.space']
# Load only the training split for the selected categories.
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories)
# Turn the raw documents into a sparse TF-IDF document-term matrix.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
# Shape is (number of documents, number of features).
print(vectors.shape)
# Average number of non-zero entries per document (row).
print(vectors.nnz / float(vectors.shape[0]))
#
(2034, 34118)
159.0132743362832
出力から、抽出されたTF-IDFベクトルは非常に疎であることが分かります。30000次元を超える特徴量のうち、1文書あたりの非ゼロ要素は平均で約159個しかありません。
ナイーブベイズによる分類
from sklearn.feature_extraction.text import TfidfVectorizer
# Restrict the experiment to four of the twenty categories.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# Load only the training split for the selected categories.
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
# Fit the TF-IDF vocabulary and weights on the training documents only.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
print(vectors.shape)
print(vectors.nnz / float(vectors.shape[0]))
# Multinomial naive Bayes classifier and the evaluation metrics.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
# Load the held-out test split for the same categories.
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
# Reuse the fitted vectorizer (transform, not fit_transform) so the test
# matrix shares the feature space learned from the training data.
vectors_test = vectorizer.transform(newsgroups_test.data)
# Train the classifier on the training vectors.
clf = MultinomialNB(alpha=0.1)
clf.fit(vectors, newsgroups_train.target)
# Predict on the test set and report the scores. The labels make the
# printed lines match the sample output shown below this snippet.
pred = clf.predict(vectors_test)
print('f1_score:', f1_score(newsgroups_test.target, pred, average='macro'))
print('accuracy:', accuracy_score(newsgroups_test.target, pred))
#
f1_score: 0.8823530044163621
accuracy: 0.8965262379896526
リファレンス
データセットのアドレス:http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
sklearn による 20 newsgroups の紹介:http://scikit-learn.org/stable/datasets/twenty_newsgroups.html
from sklearn.feature_extraction.text import TfidfVectorizer
# Restrict the experiment to four of the twenty categories.
categories = ['alt.atheism', 'talk.religion.misc','comp.graphics', 'sci.space']
# Load only the training split for the selected categories.
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories)
# Turn the raw documents into a sparse TF-IDF document-term matrix.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
# Shape is (number of documents, number of features).
print(vectors.shape)
# Average number of non-zero entries per document (row).
print(vectors.nnz / float(vectors.shape[0]))
#
(2034, 34118)
159.0132743362832
from sklearn.feature_extraction.text import TfidfVectorizer
# Restrict the experiment to four of the twenty categories.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# Load only the training split for the selected categories.
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
# Fit the TF-IDF vocabulary and weights on the training documents only.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
print(vectors.shape)
print(vectors.nnz / float(vectors.shape[0]))
# Multinomial naive Bayes classifier and the evaluation metrics.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
# Load the held-out test split for the same categories.
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
# Reuse the fitted vectorizer (transform, not fit_transform) so the test
# matrix shares the feature space learned from the training data.
vectors_test = vectorizer.transform(newsgroups_test.data)
# Train the classifier on the training vectors.
clf = MultinomialNB(alpha=0.1)
clf.fit(vectors, newsgroups_train.target)
# Predict on the test set and report the scores. The labels make the
# printed lines match the sample output shown below this snippet.
pred = clf.predict(vectors_test)
print('f1_score:', f1_score(newsgroups_test.target, pred, average='macro'))
print('accuracy:', accuracy_score(newsgroups_test.target, pred))
#
f1_score: 0.8823530044163621
accuracy: 0.8965262379896526