圆月山庄资源网 Design By www.vgjia.com
我们给大家带来了关于学习python中scikit-learn机器学习代码的相关具体实例，以下就是全部代码内容：
# -*- coding: utf-8 -*-
# Example 1: train a classifier on scikit-learn's bundled iris data and
# print precision / recall / F1 for the held-out split.
import numpy
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.datasets import load_iris
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn import preprocessing

try:
    # Newer scikit-learn ships these helpers in `sklearn.model_selection`;
    # the alias keeps the `cross_validation` name used by this code bound.
    from sklearn import model_selection as cross_validation
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide `sklearn.cross_validation`.
    from sklearn import cross_validation
    from sklearn.cross_validation import train_test_split
#import iris_data


def load_data():
    """Load the iris dataset and split it into train and test parts.

    20% of the samples are held out, with a fixed random seed so the split
    is reproducible.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)``. Note that this order
        differs from the order returned by ``train_test_split`` itself.
    """
    iris = load_iris()
    x, y = iris.data, iris.target
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=42)
    return x_train, y_train, x_test, y_test


def train_clf3(train_data, train_tags):
    """Fit and return a ``LinearSVC`` with ``C=1100.0``."""
    # LinearSVC is a linear-kernel SVM (the original comment mentioning
    # 'rbf' described ``SVC``, not this estimator).
    clf = LinearSVC(C=1100.0)
    clf.fit(train_data, train_tags)
    return clf


def train_clf(train_data, train_tags):
    """Fit and return a ``MultinomialNB`` with ``alpha=0.01``.

    The training labels are printed before fitting.
    """
    labels = numpy.asarray(train_tags)
    clf = MultinomialNB(alpha=0.01)
    print(labels)
    clf.fit(train_data, labels)
    return clf


def evaluate(actual, pred):
    """Print precision, recall and F1 of ``pred`` measured against ``actual``.

    Args:
        actual: true labels.
        pred: predicted labels, aligned with ``actual``.
    """
    # The averaging mode is passed explicitly: scikit-learn's 'binary'
    # default only supports two-class targets, and the labels scored here
    # may contain more than two classes.
    m_precision = metrics.precision_score(actual, pred, average='weighted')
    m_recall = metrics.recall_score(actual, pred, average='weighted')
    m_f1 = metrics.f1_score(actual, pred, average='weighted')
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(m_f1))


x_train, y_train, x_test, y_test = load_data()
clf = train_clf(x_train, y_train)
pred = clf.predict(x_test)
evaluate(numpy.asarray(y_test), pred)
print(metrics.classification_report(y_test, pred))


# ---------------------------------------------------------------------------
# Example 2: using custom data (使用自定义数据)
# ---------------------------------------------------------------------------
# coding: utf-8
import numpy
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import codecs
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

try:
    # Newer scikit-learn ships cross_val_score in `sklearn.model_selection`;
    # the alias keeps the `cross_validation` name used in main() bound.
    from sklearn import model_selection as cross_validation
except ImportError:
    # Older scikit-learn releases only provide `sklearn.cross_validation`.
    from sklearn import cross_validation

# Small in-memory sample corpora of space-separated tokens. They are not
# referenced by any function in this listing.
train_corpus = [
    '我们 我们 好孩子 认证 。 就是',
    '我们 好孩子 认证 。 中国',
    '我们 好孩子 认证 。 孤独',
    '我们 好孩子 认证 。',
]

test_corpus = [
    '我 菲律宾 韩国',
    '我们 好孩子 认证 。 中国',
]


def _read_tagged_file(path):
    """Parse one ``tag:words`` file into parallel document and tag lists."""
    words = []
    tags = []
    # The context manager closes the file even if parsing raises.
    with codecs.open(path, 'r', 'utf-8', 'ignore') as handle:
        for line in handle:
            tks = line.split(':', 1)
            if len(tks) < 2:
                # No tag separator (e.g. a blank trailing line): skip the
                # line instead of failing on the missing field.
                continue
            word_list = tks[1]
            # Drop the first character and the last three, then split on
            # ", ". NOTE(review): presumably each record looks like
            # "tag:[w1, w2, ...]" followed by a line ending -- confirm
            # against the real data files.
            word_array = word_list[1:(len(word_list) - 3)].split(", ")
            words.append(" ".join(word_array))
            tags.append(tks[0])
    return words, tags


def input_data(train_file, test_file):
    """Read the training and test files.

    Each usable line is split once on ':'; the part before it is the tag
    and the part after it is turned into a space-joined document.

    Args:
        train_file: path of the UTF-8 training file.
        test_file: path of the UTF-8 test file.

    Returns:
        ``(train_words, train_tags, test_words, test_tags)``.
    """
    train_words, train_tags = _read_tagged_file(train_file)
    test_words, test_tags = _read_tagged_file(test_file)
    return train_words, train_tags, test_words, test_tags


def vectorize(train_words, test_words):
    """Vectorize both corpora with a ``HashingVectorizer``.

    Returns:
        ``(train_data, test_data)``.
    """
    #v = HashingVectorizer(n_features=25000, non_negative=True)
    # NOTE(review): the `non_negative` argument is believed to have been
    # dropped from later scikit-learn releases -- confirm against the
    # installed version before relying on this function.
    v = HashingVectorizer(non_negative=True)
    #v = CountVectorizer(min_df=1)
    train_data = v.fit_transform(train_words)
    test_data = v.transform(test_words)
    return train_data, test_data


def vectorize1(train_words, test_words):
    """Vectorize both corpora with TF-IDF fitted on the training corpus.

    Returns:
        ``(train_data, test_data)`` sharing one vocabulary and one set of
        IDF weights.
    """
    tv = TfidfVectorizer(sublinear_tf=False, use_idf=True)
    train_data = tv.fit_transform(train_words)
    # Reuse the fitted vectorizer: refitting on the test corpus would weight
    # test features with test-set IDF values instead of the training ones
    # the classifier is fitted on.
    test_data = tv.transform(test_words)
    return train_data, test_data


def vectorize2(train_words, test_words):
    """Vectorize via ``CountVectorizer`` followed by ``TfidfTransformer``.

    Both steps are fitted on the training corpus only and then applied to
    the test corpus.

    Returns:
        ``(train_data, test_data)``.
    """
    count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
    counts_train = count_v1.fit_transform(train_words)
    counts_test = count_v1.transform(test_words)
    tfidftransformer = TfidfTransformer()
    train_data = tfidftransformer.fit_transform(counts_train)
    # Apply the training IDF weights; do not refit on the test counts.
    test_data = tfidftransformer.transform(counts_test)
    return train_data, test_data


def evaluate(actual, pred):
    """Print precision, recall and F1 of ``pred`` measured against ``actual``.

    Args:
        actual: true labels.
        pred: predicted labels, aligned with ``actual``.
    """
    # Weighted averaging is passed explicitly; it matches the "f1_weighted"
    # scoring used for cross-validation in main() and also works when the
    # tags contain more than two classes.
    m_precision = metrics.precision_score(actual, pred, average='weighted')
    m_recall = metrics.recall_score(actual, pred, average='weighted')
    m_f1 = metrics.f1_score(actual, pred, average='weighted')
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(m_f1))


def train_clf(train_data, train_tags):
    """Fit and return a ``MultinomialNB`` with ``alpha=0.01``."""
    clf = MultinomialNB(alpha=0.01)
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf


def train_clf1(train_data, train_tags):
    """Fit and return a ``KNeighborsClassifier`` with default settings."""
    # KNN classifier; the original comment notes the default is k=5.
    clf = KNeighborsClassifier()
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf


def train_clf2(train_data, train_tags):
    """Fit and return a ``LogisticRegression`` with ``C=1e5``."""
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_tags)
    return clf


def train_clf3(train_data, train_tags):
    """Fit and return a ``LinearSVC`` with ``C=1100.0``."""
    # LinearSVC is a linear-kernel SVM (the original comment mentioning
    # 'rbf' described ``SVC``, not this estimator).
    clf = LinearSVC(C=1100.0)
    clf.fit(train_data, train_tags)
    return clf


def train_clf4(train_data, train_tags):
    """Fit and return a 10-tree random forest.

    The original note says the forest cannot be given a sparse matrix, so
    the input is converted to a dense array first.
    """
    clf = RandomForestClassifier(n_estimators=10)
    # toarray() yields a plain ndarray, whereas todense() returns a
    # numpy.matrix.
    clf.fit(train_data.toarray(), train_tags)
    return clf


# Read a file line by line using codecs.
def codecs_read_label_line(filename):
    """Return the lines of a UTF-8 file with their line endings removed.

    Args:
        filename: path of the file to read.

    Returns:
        A list with one string per line.
    """
    label_list = []
    with codecs.open(filename, 'r', 'utf-8', 'ignore') as f:
        for line in f:
            # Strip only the line terminator; slicing off the last character
            # unconditionally would corrupt a final line without a newline.
            label_list.append(line.rstrip(u'\r\n'))
    return label_list


def save_test_features(test_url, test_label):
    """Write each line of 'test.dat' with its label to 'test_labeded.dat'.

    Output lines have the form ``<line>\\t<label>``.

    Args:
        test_url: currently unused. NOTE(review): the lines are re-read
            from 'test.dat' instead -- confirm whether this argument was
            meant to supply them.
        test_label: labels, paired in order with the lines of 'test.dat'.
    """
    test_feature_list = codecs_read_label_line('test.dat')
    with open('test_labeded.dat', "w+") as fw:
        for (url, label) in zip(test_feature_list, test_label):
            fw.write(url + '\t' + label)
            fw.write('\n')


def main():
    """Train a ``LinearSVC`` on the tagged text files and report results.

    Reads the train/test files, builds TF-IDF features, prints 5-fold
    weighted-F1 cross-validation scores on the training data, then prints
    every misclassified test item, the error count and the overall metrics.
    """
    train_file = u'..\\file\\py_train.txt'
    test_file = u'..\\file\\py_test.txt'
    train_words, train_tags, test_words, test_tags = input_data(
        train_file, test_file)
    #print len(train_words), len(train_tags), len(test_words), len(test_words),

    train_data, test_data = vectorize1(train_words, test_words)
    print(type(train_data))
    print(train_data.shape)
    print(test_data.shape)
    print(test_data[0].shape)
    print(numpy.asarray(test_data[0]))

    clf = train_clf3(train_data, train_tags)

    scores = cross_validation.cross_val_score(
        clf, train_data, train_tags, cv=5, scoring="f1_weighted")
    print(scores)
    #predicted = cross_validation.cross_val_predict(clf, train_data,train_tags, cv=5)

    pred = clf.predict(test_data)
    error_list = []
    for (true_tag, predict_tag) in zip(test_tags, pred):
        if true_tag != predict_tag:
            # Build the "true predicted" pair once; it is both printed and
            # recorded.
            mismatch = true_tag + ' ' + predict_tag
            print(mismatch)
            error_list.append(mismatch)
    print(len(error_list))

    evaluate(numpy.asarray(test_tags), pred)

    # Output the labelling results (left disabled in the original):
    # test_feature_list = codecs_read_label_line('test.dat')
    # save_test_features(test_feature_list, pred)


if __name__ == '__main__':
    main()
圆月山庄资源网 Design By www.vgjia.com
广告合作:本站广告合作请联系QQ:858582 申请时备注:广告合作(否则不回)
免责声明:本站文章均来自网站采集或用户投稿,网站不提供任何软件下载或自行开发的软件! 如有用户或公司发现本站内容信息存在侵权行为,请邮件告知! 858582#qq.com
免责声明:本站文章均来自网站采集或用户投稿,网站不提供任何软件下载或自行开发的软件! 如有用户或公司发现本站内容信息存在侵权行为,请邮件告知! 858582#qq.com
圆月山庄资源网 Design By www.vgjia.com
暂无评论...
P70系列延期,华为新旗舰将在下月发布
3月20日消息,近期博主@数码闲聊站 透露,原定三月份发布的华为新旗舰P70系列延期发布,预计4月份上市。
而博主@定焦数码 爆料,华为的P70系列在定位上已经超过了Mate60,成为了重要的旗舰系列之一。它肩负着重返影像领域顶尖的使命。那么这次P70会带来哪些令人惊艳的创新呢?
根据目前爆料的消息来看,华为P70系列将推出三个版本,其中P70和P70 Pro采用了三角形的摄像头模组设计,而P70 Art则采用了与上一代P60 Art相似的不规则形状设计。这样的外观是否好看见仁见智,但辨识度绝对拉满。
更新日志
2024年11月07日
2024年11月07日
- 雨林唱片《赏》新曲+精选集SACD版[ISO][2.3G]
- 罗大佑与OK男女合唱团.1995-再会吧!素兰【音乐工厂】【WAV+CUE】
- 草蜢.1993-宝贝对不起(国)【宝丽金】【WAV+CUE】
- 杨培安.2009-抒·情(EP)【擎天娱乐】【WAV+CUE】
- 周慧敏《EndlessDream》[WAV+CUE]
- 彭芳《纯色角3》2007[WAV+CUE]
- 江志丰2008-今生为你[豪记][WAV+CUE]
- 罗大佑1994《恋曲2000》音乐工厂[WAV+CUE][1G]
- 群星《一首歌一个故事》赵英俊某些作品重唱企划[FLAC分轨][1G]
- 群星《网易云英文歌曲播放量TOP100》[MP3][1G]
- 方大同.2024-梦想家TheDreamer【赋音乐】【FLAC分轨】
- 李慧珍.2007-爱死了【华谊兄弟】【WAV+CUE】
- 王大文.2019-国际太空站【环球】【FLAC分轨】
- 群星《2022超好听的十倍音质网络歌曲(163)》U盘音乐[WAV分轨][1.1G]
- 童丽《啼笑姻缘》头版限量编号24K金碟[低速原抓WAV+CUE][1.1G]