1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
| from numpy import *
def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats into a list of rows.

    Each line is stripped, split on tab characters, and every field is
    converted with ``float``. By this file's convention the last column
    is the target value.

    Args:
        fileName: Path of the file to read.

    Returns:
        A list containing one list of floats per line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float`` (this
            includes blank lines, which yield a single empty field).
    """
    dataMat = []
    # The context manager closes the file even when a field fails to parse.
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            # Build a real list: on Python 3 a bare map() is a one-shot
            # iterator, which array()/mat() cannot turn into numeric rows.
            dataMat.append([float(field) for field in curLine])
    return dataMat


def plotBestFit(file):
    """Show a scatter plot of the first two columns of a data file.

    The file is read with ``loadDataSet``; column 0 is drawn on the X axis
    and column 1 on the Y axis as green squares. The call blocks in
    ``plt.show()`` until the figure window is closed.

    Args:
        file: Path of a tab-delimited data file.
    """
    import matplotlib.pyplot as plt

    dataArr = array(loadDataSet(file))  # rows -> 2-D array for indexing
    n = shape(dataArr)[0]
    xcord1 = [dataArr[i, 0] for i in range(n)]
    ycord1 = [dataArr[i, 1] for i in range(n)]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='green', marker='s')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
def binSplitDataSet(dataSet, feature, value):
    """Split a data set into two subsets by thresholding one feature.

    Args:
        dataSet: 2-D NumPy array or matrix with one sample per row.
        feature: Column index of the feature to compare.
        value: Threshold the feature column is compared against.

    Returns:
        A tuple ``(mat0, mat1)``: ``mat0`` holds the rows whose feature is
        strictly greater than ``value``, ``mat1`` the rows whose feature is
        less than or equal to it. Either subset may have zero rows.
    """
    column = dataSet[:, feature]
    # nonzero(...)[0] gives the row indices where the comparison holds.
    # Every matching row is kept: indexing the result with a trailing [0]
    # would reduce each subset to its first row and raise IndexError when
    # the subset is empty.
    mat0 = dataSet[nonzero(column > value)[0], :]
    mat1 = dataSet[nonzero(column <= value)[0], :]
    return mat0, mat1
def regLeaf(dataSet):
    """Build a regression leaf value: the mean of the last (target) column.

    Args:
        dataSet: 2-D NumPy array or matrix whose last column is the target.

    Returns:
        The mean of all target values.
    """
    targets = dataSet[:, -1]
    return mean(targets)
def regErr(dataSet):
    """Return the total squared error of the last (target) column.

    The variance of the targets multiplied by the number of rows equals
    the sum of squared deviations from the target mean.

    Args:
        dataSet: 2-D NumPy array or matrix whose last column is the target.

    Returns:
        Target variance times the row count.
    """
    targetVariance = var(dataSet[:, -1])
    sampleCount = shape(dataSet)[0]
    return targetVariance * sampleCount
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Find the best binary split of a data set, or decide to make a leaf.

    Every distinct value of every feature column is tried as a threshold.
    The split that minimises the summed error of the two resulting subsets
    is chosen, subject to the stopping rules below.

    Args:
        dataSet: 2-D NumPy matrix or array; the last column is the target
            and all other columns are features.
        leafType: Callable that builds a leaf value from a data set.
        errType: Callable that returns the total error of a data set.
        ops: Pair ``(tolS, tolN)``. ``tolS`` is the minimum error reduction
            a split must achieve; ``tolN`` is the minimum number of samples
            each side of a split must contain.

    Returns:
        ``(None, leafType(dataSet))`` when no split should be made,
        otherwise ``(bestIndex, bestValue)``: the feature column and the
        threshold of the chosen split.
    """
    tolS = ops[0]
    tolN = ops[1]

    # Exit condition 1: every target value is identical, nothing to split.
    # asarray(...).ravel() flattens the column whether dataSet is a matrix
    # (column is m x 1) or a plain ndarray (column is 1-D).
    targets = asarray(dataSet[:, -1]).ravel().tolist()
    if len(set(targets)) == 1:
        return None, leafType(dataSet)

    n = shape(dataSet)[1]
    # The best feature is chosen by the reduction in error relative to the
    # error of the unsplit data set.
    S = errType(dataSet)
    bestS = inf
    bestIndex = 0
    bestValue = 0
    for featIndex in range(n - 1):  # last column is the target, skip it
        # Convert the column to plain floats before de-duplicating:
        # iterating a NumPy matrix column yields 1x1 matrices, which are
        # unhashable and cannot be put in a set. Sorting fixes the order in
        # which candidates are tried, so ties are broken deterministically
        # (first feature, then lowest threshold).
        candidates = sorted(set(asarray(dataSet[:, featIndex]).ravel().tolist()))
        for splitVal in candidates:
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            # Skip splits that leave either side with too few samples.
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS

    # Exit condition 2: the best split does not reduce the error enough.
    # This also covers the case where no candidate passed the size check,
    # because bestS is then still inf.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)

    # Exit condition 3: defensive re-check that the chosen split leaves
    # enough samples on both sides.
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)

    return bestIndex, bestValue
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a binary regression tree.

    Args:
        dataSet: 2-D NumPy matrix or array; the last column is the target.
        leafType: Callable that builds a leaf value from a data set.
        errType: Callable that returns the total error of a data set.
        ops: Pair ``(tolS, tolN)`` forwarded to ``chooseBestSplit``.

    Returns:
        A leaf value when ``chooseBestSplit`` reports no split. Otherwise a
        dict with keys ``'spInd'`` (split feature index), ``'spVal'``
        (split threshold), ``'left'`` and ``'right'`` (sub-trees built from
        the two subsets returned by ``binSplitDataSet``).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Recursion ends when no split was chosen; val is then the leaf value.
    # Identity test: feat is either None or an integer column index.
    if feat is None:
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }
|