{"id":473,"date":"2019-07-12T20:23:00","date_gmt":"2019-07-12T12:23:00","guid":{"rendered":"http:\/\/note.systw.net\/note\/?p=473"},"modified":"2023-11-02T20:24:52","modified_gmt":"2023-11-02T12:24:52","slug":"sklearn-dataset","status":"publish","type":"post","link":"https:\/\/systw.net\/note\/archives\/473","title":{"rendered":"SKLearn Dataset"},"content":{"rendered":"\n<p>\u8cc7\u6599\u96c6\u7684\u4f86\u6e90\u4e3b\u8981\u6709\u5169\u7a2e\uff0c\u5167\u5efa\u8cc7\u6599\u96c6\uff0c\u6216\u662f\u7531\u5916\u90e8\u8f09\u5165\u7684\u8cc7\u6599\u96c6<\/p>\n\n\n\n<p>&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;..<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u8f09\u5165\u5167\u5efa\u8cc7\u6599\u96c6<\/h2>\n\n\n\n<p><br><strong>\u8f09\u5165\u8cc7\u6599\u96c6<\/strong><br>&gt;&gt;&gt;from sklearn import datasets<\/p>\n\n\n\n<p><br><strong>iris\u8cc7\u6599\u96c6<\/strong><br>\u5305\u542b\u4e09\u7a2e\u4e0d\u540c\u7a2e\u985e\u7684\u82b1\u6735\u7279\u5fb5\u8cc7\u6599<br>ex:<br>&gt;&gt;&gt; from sklearn import datasets<br>&gt;&gt;&gt; iris = datasets.load_iris()<br>&gt;&gt;&gt; iris.data.shape<br>(150, 4)<br>&gt;&gt;&gt;print(iris.data)<br>[[ 5.1 3.5 1.4 0.2]<br>[ 4.9 3. 
1.4 0.2]<br>[ 4.7 3.2 1.3 0.2]<br>[ 4.6 3.1 1.5 0.2]<br>&#8230;omit&#8230;<br>&gt;&gt;&gt;print(iris.target)<br>[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0<br>0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1<br>1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2<br>2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2<br>2 2]<br>ps<br>iris.data \u5c31\u6703\u662f\u771f\u5be6\u7684 data \u8cc7\u6599,&nbsp;<br>iris.target \u5c31\u662f\u8981\u88ab training \u7684 classes<br>iris.data.shape \u5c31\u662f\u8cc7\u6599\u7684\u7b46\u6578\u548c\u7279\u5fb5\u6578<\/p>\n\n\n\n<p><br><strong>\u75be\u75c5\u56b4\u91cd\u7a0b\u5ea6\u9810\u6e2c<\/strong><br>diabetes\u8cc7\u6599\u96c6\u5305\u542b442\u4f4d\u75c5\u4eba\u7684\u8cc7\u6599,\u6bcf\u500b\u75c5\u4eba\u6536\u96c6\u5341\u7a2e\u7279\u5fb5<br>ex:<br>&gt;&gt;&gt; from sklearn import datasets<br>&gt;&gt;&gt; diabetes = datasets.load_diabetes()<br>&gt;&gt;&gt; print(diabetes.data.shape)<br>(442, 10)<br>&gt;&gt;&gt; print diabetes.target<br>[ 151. 75. 141. 206. 135. 97. 138. 63. 110. 310. 101. 69.<br>179. 185. 118. 171. 166. 144. 97. 168. 68. 49. 68. 245.<br>184. 202. 137. 85. 131. 283. 129. 59. 341. 87. 65. 102.<br>265. 276. 252. 90. 100. 55. 61. 92. 259. 53. 190. 142.<br>75. 142. 155. 225. 59. 104. 182. 128. 52. 37. 170. 
170.<br>&#8230;omit&#8230;<\/p>\n\n\n\n<p><br><strong>\u505a\u5716\u7247\u7684\u6587\u5b57\u8fa8\u8b58<\/strong><br>\u4f7f\u7528\u7684\u662fUCI\u7684digits\u8cc7\u6599\u96c6,\u5171\u67091797\u7b4664pixel\u7684\u7070\u968e\u5716\u7247<br>ex:<br>&gt;&gt;&gt; from sklearn import datasets<br>&gt;&gt;&gt; digits = datasets.load_digits()<\/p>\n\n\n\n<p><br><strong>boston\u623f\u50f9<\/strong><br>\u9069\u5408\u505aregression<br>ex:<br>&gt;&gt;&gt; from sklearn import datasets<br>&gt;&gt;&gt; boston = datasets.load_boston()<br>&gt;&gt;&gt; boston.data.shape<br>(506, 13)<\/p>\n\n\n\n<p>refer<br>http:\/\/www.cs.toronto.edu\/~delve\/data\/boston\/bostonDetail.html<\/p>\n\n\n\n<p>&#8230;<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u8b80\u53d6\u5916\u90e8\u8cc7\u6599\u96c6&nbsp;<\/h2>\n\n\n\n<p><strong>\u8f09\u5165\u5916\u90e8\u8cc7\u6599<\/strong><br>sklearn.datasets.load_files(container_path, description=None, categories=None, load_content=True, shuffle=True,encoding=None, charset=None, charset_error=None, decode_error=&#8217;strict&#8217;, random_state=0)<\/p>\n\n\n\n<p>ex:<br>from sklearn.datasets import load_files<br>load_files(container_path)<br>ps:<br>\u8b80\u53d6\u5916\u90e8\u8cc7\u6599\u4e5f\u53ef\u7528numpy loadtxt<\/p>\n\n\n\n<p><br><br>&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;&#8230;.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u8cc7\u6599\u5207\u5272<\/h2>\n\n\n\n<p>\u8f09\u5165\u8cc7\u6599\u5f8c\u901a\u5e38\u90fd\u6703\u5207\u62102\u90e8\u4efd\u505a\u70batraining dataset\u548ctesting dataset\uff0c\u4ee5\u4e0b\u662f\u5e38\u898b\u7684\u5207\u6cd5<\/p>\n\n\n\n<p><strong>\u8cc7\u6599\u5207\u5272<\/strong><br>ex<br>\u5012\u657830\u7b46\u505a\u70batest data,\u5176\u9918\u70batrain data<br>&gt;&gt;&gt; from sklearn import datasets<br>&gt;&gt;&gt; diabetes = datasets.load_diabetes()<br>&gt;&gt;&gt; print(diabetes.data.shape)<br>(442, 10)<br>&gt;&gt;&gt; diabetes_train_data = diabetes.data[:-30]<br>&gt;&gt;&gt; 
diabetes_train_target = diabetes.target[:-30]<br>&gt;&gt;&gt; diabetes_test_data = diabetes.data[-30:]<br>&gt;&gt;&gt; diabetes_test_target = diabetes.target[-30:]<\/p>\n\n\n\n<p><br><strong>\u8cc7\u6599\u5207\u5272by train_test_split<\/strong><br><strong>1<br>\u8f09\u5165cross_validation<\/strong><br>from sklearn.cross_validation import train_test_split<br><strong>2<br>\u5207\u5272<\/strong><br>d_train, d_test, t_train, t_test = train_test_split(data, target, test_size=0.2, random_state=0)<br>test_size \u8a2d\u5b9a\u5e7e\uff05\u8cc7\u6599\u70batestdata,\u5176\u9918\u70batraindata<br>random_state \u5c07\u4e82\u6578\u7684\u72c0\u614b\u56fa\u5b9a, \u76f8\u540c\u6578\u503c\u6703\u8b93\u6bcf\u6b21\u57f7\u884c\u6642\u8cc7\u6599\u5207\u5272\u90fd\u6703\u4e00\u6a23,<br>ps:\u4e0d\u8a2drandom_state\u8cc7\u6599\u6bcf\u6b21\u90fd\u6703\u6253\u4e82<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>ex:<br>from sklearn.cross_validation import train_test_split<br>train_data, test_data, train_target, test_target = train_test_split(iris.data,iris.target, test_size=0.2, random_state=0)<br>print 'train total:',len(train_target),', target:',sum(train_target)<br>print 'test total:',len(test_target),', target:',sum(test_target)<\/code><\/pre>\n\n\n\n<p><strong>\u8cc7\u6599\u5207\u5272by ShuffleSplit<\/strong><br><strong>1<br>\u8f09\u5165cross_validation<\/strong><br>from sklearn.cross_validation import ShuffleSplit,cross_val_score<br><strong>2<br>\u5207\u5272<\/strong><br>cv = ShuffleSplit(n_samples, n_iter=3, test_size=0.1, random_state=0)<br>\u3000test_size=0.1 \u8a2d\u5b9a\u5e7e\uff05\u8cc7\u6599\u70batestdata,\u5176\u9918\u70batraindata<br>\u3000n_iter=3 \u505a\u6210 3 \u4efd,\u4e5f\u5c31\u662f 3-fold cv<br>\u3000n_samples \u5e7e\u4efd\u8cc7\u6599<br><strong>3<br>\u6839\u64daclf \u8a08\u7b97\u5404\u5225\u5206\u6578<\/strong><br>test_scores = cross_val_score(clf, data, target, cv=cv, n_jobs=2)<br>\u3000clf = &lt; any classifier&gt;<br>print 
test_scores<br><br>\u3000<br>refer\u3000<br>http:\/\/hhtucode.blogspot.tw\/2013\/10\/python-ml-with-scikit-learn-model.html<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u8cc7\u6599\u96c6\u7684\u4f86\u6e90\u4e3b\u8981\u6709\u5169\u7a2e\uff0c\u5167\u5efa\u8cc7\u6599\u96c6\uff0c\u6216\u662f\u7531\u5916\u90e8\u8f09\u5165\u7684\u8cc7\u6599\u96c6  &#8230;<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"fifu_image_url":"","fifu_image_alt":"","_jetpack_memberships_contains_paid_content":false,"footnotes":"","jetpack_publicize_message":"","jetpack_publicize_feature_enabled":true,"jetpack_social_post_already_shared":false,"jetpack_social_options":{"image_generator_settings":{"template":"highway","default_image_id":0,"font":"","enabled":false},"version":2}},"categories":[13],"tags":[],"class_list":["post-473","post","type-post","status-publish","format-standard","hentry","category-dataanalysis"],"jetpack_publicize_connections":[],"jetpack_featured_media_url":"","jetpack_sharing_enabled":true,"_links":{"self":[{"href":"https:\/\/systw.net\/note\/wp-json\/wp\/v2\/posts\/473","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/systw.net\/note\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/systw.net\/note\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/systw.net\/note\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/systw.net\/note\/wp-json\/wp\/v2\/comments?post=473"}],"version-history":[{"count":0,"href":"https:\/\/systw.net\/note\/wp-json\/wp\/v2\/posts\/473\/revisions"}],"wp:attachment":[{"href":"https:\/\/systw.net\/note\/wp-json\/wp\/v2\/media?parent=473"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/systw.net\/note\/wp-json\/wp\/v2\/categories?post=473"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/systw.net\/note\/wp-json\/wp\/v
2\/tags?post=473"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}