For the theoretical background, see:
https://blog.csdn.net/zwqjoy/article/details/80431496
https://sklearn.apachecn.org/docs/0.21.3/12.html
Bagging:
import itertools
# a grab bag of handy numerical routines
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# gridspec: control subplot position and size
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score,train_test_split
# mlxtend: a handy library for stacking and ensemble plotting
# plot learning curves
from mlxtend.plotting import plot_learning_curves
# plot classifier decision regions
from mlxtend.plotting import plot_decision_regions
np.random.seed(0)
iris = datasets.load_iris()
X,y = iris.data[:,0:2],iris.target
clf1 = DecisionTreeClassifier(criterion='entropy',max_depth=1)
# the decision tree uses information entropy as its split criterion
clf2 = KNeighborsClassifier(n_neighbors=1)
bagging1 = BaggingClassifier(base_estimator=clf1,n_estimators=10,max_samples=0.8,max_features=0.8)
bagging2 = BaggingClassifier(base_estimator=clf2,n_estimators=10,max_samples=0.8,max_features=0.8)
label = ['Decision Tree','K-NN','Bagging Tree','Bagging K-NN']
clf_list =[clf1,clf2,bagging1,bagging2]
fig = plt.figure(figsize=(10,8))
gs = gridspec.GridSpec(2,2)  # a 2x2 grid of subplots
grid = itertools.product([0,1],repeat=2)  # Cartesian product: yields (0, 0), (0, 1), (1, 0), (1, 1)
# itertools.product(list1, list2) computes the Cartesian product of its arguments
for clf, label, grd in zip(clf_list, label, grid):  # zip() is explained at the end of this snippet
    # 3-fold cross-validation; scoring='accuracy' can be omitted (it is the default metric)
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    # report the mean and standard deviation of the fold scores
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(label)
plt.show()
# The plots above show the decision boundaries of the decision tree and k-NN classifiers,
# and of their bagging ensembles, on the Iris dataset.
# The decision tree produces axis-parallel boundaries.
# k-NN is largely insensitive to perturbations of the training samples (a stable learner),
# so the bagged decision tree gains more accuracy from bagging than the bagged k-NN does.
#plot learning curves
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)
plt.figure()
plot_learning_curves(X_train,y_train,X_test,y_test,bagging1,print_model=False,style='ggplot')
plt.show()
# The plot above shows the learning curve of the bagged tree ensemble. The average error
# on the training data settles around 0.3, while the test error curve is U-shaped.
# The smallest gap between training and test error occurs at roughly 80% of the training set size.
""">>> a = [1,2,3]
>>> b = [4,5,6]
>>> c = [4,5,6,7,8]
>>> zipped = zip(a,b) # returns a zip object (an iterator)
>>> zipped
>>> list(zipped) # list() converts it to a list
[(1, 4), (2, 5), (3, 6)]
>>> list(zip(a,c)) # the result is as long as the shortest input
[(1, 4), (2, 5), (3, 6)]
>>> a1, a2 = zip(*zip(a,b)) # zip(*) does the reverse ("unzips"), recovering the original sequences
>>> list(a1)
[1, 2, 3]
>>> list(a2)
[4, 5, 6]
"""
Boosting:
import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score,train_test_split
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions
iris = datasets.load_iris()
X,y = iris.data[:,0:2],iris.target
clf = DecisionTreeClassifier(criterion='entropy',max_depth=1)
num_est = [1,2,3,10]
label = ['AdaBoost(n_est=1)','AdaBoost(n_est=2)','AdaBoost(n_est=3)','AdaBoost(n_est=10)']
fig = plt.figure(figsize=(10,8))
gs = gridspec.GridSpec(2,2)
grid = itertools.product([0,1],repeat=2)
for n_est, label, grd in zip(num_est, label, grid):
    # n_estimators sets the number of weak learners (decision stumps) in the ensemble
    boosting = AdaBoostClassifier(base_estimator=clf, n_estimators=n_est)
    boosting.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=boosting, legend=2)
    plt.title(label)
# Each base learner is a depth-1 decision tree (a decision stump):
# it classifies the data on a single feature threshold, splitting the space
# into two regions separated by a linear decision surface parallel to one of the axes.
plt.show()
#plot learning curves
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)
boosting = AdaBoostClassifier(base_estimator=clf,n_estimators=10)
plt.figure()
plot_learning_curves(X_train,y_train,X_test,y_test,boosting,print_model=False,style='ggplot')
plt.show()
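A complementary check that is not in the original post: scikit-learn's AdaBoostClassifier exposes staged_score, which evaluates the ensemble after every boosting round, so the effect of n_estimators can be traced from a single fit instead of refitting four separate models. A minimal sketch under the same iris setup; the train/test split and the round numbers printed are illustrative.
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
X = X[:, 0:2]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

stump = DecisionTreeClassifier(criterion='entropy', max_depth=1)
ada = AdaBoostClassifier(stump, n_estimators=50, random_state=0)
ada.fit(X_train, y_train)

# staged_score yields the test accuracy after 1, 2, ..., n_estimators rounds
for i, score in enumerate(ada.staged_score(X_test, y_test), start=1):
    if i in (1, 2, 3, 10, 50):
        print("rounds=%2d  test accuracy=%.2f" % (i, score))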
Stacking:
import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions
iris = datasets.load_iris()
X,y = iris.data[:,1:3],iris.target
clf1 = KNeighborsClassifier(n_neighbors=1)  # n_neighbors: number of nearest neighbours considered
clf2 = RandomForestClassifier(random_state=1)  # random_state: seed for the random number generator
clf3 = GaussianNB()  # Gaussian naive Bayes
lr = LogisticRegression()  # used as the meta-classifier
sclf = StackingClassifier(classifiers=[clf1,clf2,clf3],meta_classifier=lr)
label = ['KNN','RandomForest','Naive Bayes','Stacking Classifier']
clf_list = [clf1,clf2,clf3,sclf]
fig = plt.figure(figsize=(10,8))
gs = gridspec.GridSpec(2,2)
grid = itertools.product([0,1],repeat=2)
clf_cv_mean,clf_cv_std= [],[]
for clf, label, grd in zip(clf_list, label, grid):
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    # a smaller standard deviation means a more stable model
    print("Accuracy: mean %.2f (+/- std %.2f) [%s]" % (scores.mean(), scores.std(), label))
    clf_cv_mean.append(scores.mean())
    clf_cv_std.append(scores.std())
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf)
    plt.title(label)
plt.show()
#plot learning curves
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)
plt.figure()
plot_learning_curves(X_train,y_train,X_test,y_test,sclf,print_model=True,style='ggplot')
plt.show()
# Stacking achieves higher accuracy than the individual classifiers, and the learning curve shows no sign of overfitting.
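For reference (not part of the original code): newer scikit-learn releases (0.22 and later) include their own sklearn.ensemble.StackingClassifier, which trains the meta-learner on cross-validated predictions of the base learners. A minimal sketch of the same KNN / random forest / Gaussian naive Bayes stack with a logistic-regression meta-classifier; cv=3 is an illustrative choice.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
X = X[:, 1:3]  # same two features as above

stack = StackingClassifier(
    estimators=[('knn', KNeighborsClassifier(n_neighbors=1)),
                ('rf', RandomForestClassifier(random_state=1)),
                ('gnb', GaussianNB())],
    final_estimator=LogisticRegression(),
    cv=3)  # out-of-fold predictions of the base learners feed the meta-learner

scores = cross_val_score(stack, X, y, cv=3, scoring='accuracy')
print("Stacking (sklearn) accuracy: %.2f (+/- %.2f)" % (scores.mean(), scores.std()))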