Text classification in Python

The script below trains a simple topic classifier: it reads a pre-segmented Chinese corpus from disk, builds TF-IDF features, and fits a linear classifier with SGD.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import codecs

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
def tokenize(text):
    # The corpus is already word-segmented, so this mainly splits on
    # whitespace and punctuation.
    tokens = nltk.word_tokenize(text)
    # stems = stem_tokens(tokens, stemmer)
    return tokens
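# Note: the directory name "topic_corpus_cut" suggests the documents are
# already word-segmented, so nltk.word_tokenize mostly splits on the
# inserted whitespace. For raw, unsegmented Chinese text you would need a
# segmenter first -- a minimal sketch (jieba is an assumption here, not
# part of the original script):
#
#     import jieba
#
#     def tokenize_raw(text):
#         return jieba.lcut(text)  # returns a list of segmented tokens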
def read_corpus(topics):
    print("------------start-------------")
    token_dict = {}
    y_train = []
    for i, topic in enumerate(topics):  # the original hard-coded range(6)
        dROOT_SUB = u'./data/topic_corpus_cut/' + topic + '/'
        count = 0
        for subdir, dirs, files in os.walk(dROOT_SUB):
            for name in files:
                file_path = os.path.join(subdir, name)
                with codecs.open(file_path, "r", "utf-8") as f:
                    # key by full path so duplicate file names in different
                    # topic folders do not overwrite each other
                    token_dict[file_path] = f.read()
                count += 1
        y_train.extend([i] * count)
    # dict insertion order (Python 3.7+) keeps values aligned with y_train
    return token_dict, y_train
def train_model(token_dict):
    # Building the TF-IDF matrix can take some time; max_features=400
    # keeps only the 400 highest-scoring terms.
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=None, max_features=400)
    tfs = tfidf.fit_transform(token_dict.values())
    print(tfs.shape)
    return tfs, tfidf
if __name__ == '__main__':
    topics = [u'体育', u'社会', u'管理']  # sports, society, management
    token_dict, y_train = read_corpus(topics)
    X_train, tfidf = train_model(token_dict)
    parameters = {
        'loss': 'hinge',
        'penalty': 'l2',
        'max_iter': 50,  # called 'n_iter' in old scikit-learn releases
        'alpha': 0.00001,
        'fit_intercept': True,
    }
    # parameters = {'alpha': 0.01}
    # clf = MultinomialNB(**parameters).fit(X_train, y_train)
    clf = SGDClassifier(**parameters).fit(X_train, y_train)
    # a hand-written test sentence about government procurement
    X_test_str = u'政府采购好事方向应该支持运行急待改进完善提高专业性数额急需采购东西放权专业技术人员采购人员专业手续繁杂东西差价时间影响工作采购也许腐败'
    X_test = tfidf.transform([X_test_str])
    pred = clf.predict(X_test)
    print(pred)
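The script above trains on the whole corpus and checks only one hand-written sentence. A quick way to estimate accuracy is to hold out part of the corpus; the sketch below uses scikit-learn's train_test_split and classification_report (the 80/20 split and random_state are arbitrary choices, not from the original script):

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# hold out 20% of the TF-IDF matrix for evaluation
X_tr, X_te, y_tr, y_te = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)
clf = SGDClassifier(**parameters).fit(X_tr, y_tr)
print(classification_report(y_te, clf.predict(X_te)))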