Steps to Implement a Decision Tree Algorithm in Python
Step 1: Data preprocessing
Before running the decision tree algorithm, the data needs to be preprocessed: cleaning, handling missing values, feature selection, and so on.
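As one possibility, here is a minimal cleaning sketch using pandas (an assumption for illustration; the hand-made demo data below needs none of this):

import pandas as pd

# Toy records; None marks a missing value. pandas and the '未知'
# placeholder are assumptions, not part of the article's demo.
raw = pd.DataFrame({'姓名': ['皮蛋', '皮蛋', None],
                    '性别': ['男', '男', '女']})
raw = raw.drop_duplicates()                # remove exact duplicate rows
raw['姓名'] = raw['姓名'].fillna('未知')    # fill missing values with a placeholder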
Step 2: Feature selection
Choosing good split features is the core of the decision tree algorithm. Common selection criteria are information gain, information gain ratio, and the Gini index.
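The demo below uses information gain (ID3). For comparison, here is a small sketch of the Gini index, written against the same row convention (class label in the last column); it is illustrative only and not used by the demo:

def calc_gini(rows):
    # Gini = 1 - sum(p_k ** 2) over the class proportions p_k.
    counts = {}
    for row in rows:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    total = len(rows)
    return 1.0 - sum((n / total) ** 2 for n in counts.values())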
Step 3: Building the tree
Once the split features can be chosen, the tree itself can be built. Construction is recursive: split the data on the best feature, then repeat on each resulting subset until a stopping condition is met.
Step 4: Pruning
After the tree is built, it should be pruned to reduce overfitting.
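The demo below skips this step. As one possibility, here is a simplified reduced-error pruning sketch (an assumption, not part of the original demo): it collapses a subtree into the majority class whenever that does not hurt accuracy on a held-out validation set. It leans on majorityCnt() and classify() from the complete listing further down, and, as a simplification, scores every node against the whole validation set rather than only the rows routed to that node:

def prune(tree, featLabels, valData):
    # Bottom-up: prune the children first, then consider collapsing
    # this node into a single leaf.
    if not isinstance(tree, dict):
        return tree                              # already a leaf
    featName = list(tree.keys())[0]
    for value, subtree in tree[featName].items():
        tree[featName][value] = prune(subtree, featLabels, valData)
    leaf = majorityCnt([row[-1] for row in valData])   # candidate leaf

    def accuracy(candidate):
        hits = 0
        for row in valData:
            if isinstance(candidate, dict):
                pred = classify(candidate, featLabels, row[:-1])
            else:
                pred = candidate                 # a bare class label
            hits += int(pred == row[-1])
        return hits / len(valData)

    # keep the collapse only if validation accuracy does not drop
    return leaf if accuracy(leaf) >= accuracy(tree) else tree

It would be called as prune(decision_tree, ['姓名', '性别'], valData), with valData rows in the same format as the training data.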
Step 5: Prediction
Once the tree has been built (and pruned), it can be used to classify new samples.
A Code Walkthrough of the Python Implementation
Below, we use "皮蛋编程" as a running example to show how to implement a decision tree in Python.
First, prepare the data:

data = [['皮蛋', '男', '是'], ['皮蛋', '女', '是'],
        ['皮蛋编程', '男', '是'], ['皮蛋编程', '女', '否']]   # 姓名, 性别, class label
labels = ['姓名', '性别']                                     # feature names only
Next, the rows can be converted into a more readable format, such as dicts keyed by feature name. Note that the tree-building functions below operate on the raw list-of-lists form, not on these dicts:

def create_dataset():
    # Build a {feature name: value} dict per row, for readability only.
    # Loop over len(labels), not len(item): each row has one more column
    # (the class label) than there are feature names.
    dataset = []
    for item in data:
        temp_dict = {}
        for i in range(len(labels)):
            temp_dict[labels[i]] = item[i]
        dataset.append(temp_dict)
    return dataset
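With the demo data, create_dataset() yields:

[{'姓名': '皮蛋', '性别': '男'}, {'姓名': '皮蛋', '性别': '女'},
 {'姓名': '皮蛋编程', '性别': '男'}, {'姓名': '皮蛋编程', '性别': '女'}]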
Then choose the feature to split on. Here we use the ID3 algorithm: compute the information gain of each feature and split on the one with the largest gain.

import math

def calc_entropy(data):
    # Shannon entropy of the class labels (last column of each row).
    labelCounts = {}
    numEntries = len(data)
    for featureVec in data:
        currentLabel = featureVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    entropy = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        entropy -= prob * math.log(prob, 2)
    return entropy

def split_dataset(data, axis, value):
    # Rows whose feature `axis` equals `value`, with that column removed.
    reduced_data = []
    for featureVec in data:
        if featureVec[axis] == value:
            reduced_featureVec = featureVec[:axis]
            reduced_featureVec.extend(featureVec[axis+1:])
            reduced_data.append(reduced_featureVec)
    return reduced_data

def choose_best_feature(data):
    # ID3: pick the feature whose split yields the largest information gain.
    numFeatures = len(data[0]) - 1          # last column is the class label
    baseEntropy = calc_entropy(data)
    bestGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in data]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subData = split_dataset(data, i, value)
            prob = len(subData) / float(len(data))
            newEntropy += prob * calc_entropy(subData)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestGain:
            bestGain = infoGain
            bestFeature = i
    return bestFeature
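A quick sanity check on the demo data: calc_entropy(data) is about 0.8113 (three '是' rows against one '否'), and choose_best_feature(data) returns 0, because '姓名' and '性别' tie on information gain (about 0.3113 each) and the strict > comparison keeps the first feature.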
Next, build the tree recursively, representing it as a nested dict:

def createTree(data, labels):
    # `labels` is consumed destructively; pass a copy (labels[:]) if the
    # original list is still needed afterwards. majorityCnt() is defined
    # in the complete listing below.
    classList = [example[-1] for example in data]
    if classList.count(classList[0]) == len(classList):
        return classList[0]                 # all rows share one class: leaf
    if len(data[0]) == 1:
        return majorityCnt(classList)       # no features left: majority vote
    bestFeat = choose_best_feature(data)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in data]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            split_dataset(data, bestFeat, value), subLabels)
    return myTree
Finally, use the finished tree to classify new samples:

def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]    # feature tested at this node
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None                       # stays None for unseen values
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel
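For example, with the tree built below, classify(decision_tree, ['姓名', '性别'], ['皮蛋', '女']) returns '是' straight from the '皮蛋' leaf, without ever testing 性别; for a 姓名 value never seen in training, the loop falls through and the function returns None.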
The complete program:

import math
import operator

data = [['皮蛋', '男', '是'], ['皮蛋', '女', '是'],
        ['皮蛋编程', '男', '是'], ['皮蛋编程', '女', '否']]
labels = ['姓名', '性别']

def create_dataset():
    # Dict view of the rows, for readability only.
    dataset = []
    for item in data:
        temp_dict = {}
        for i in range(len(labels)):
            temp_dict[labels[i]] = item[i]
        dataset.append(temp_dict)
    return dataset

def calc_entropy(data):
    # Shannon entropy of the class labels (last column).
    labelCounts = {}
    numEntries = len(data)
    for featureVec in data:
        currentLabel = featureVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    entropy = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        entropy -= prob * math.log(prob, 2)
    return entropy

def split_dataset(data, axis, value):
    # Rows where feature `axis` equals `value`, with that column removed.
    reduced_data = []
    for featureVec in data:
        if featureVec[axis] == value:
            reduced_featureVec = featureVec[:axis]
            reduced_featureVec.extend(featureVec[axis+1:])
            reduced_data.append(reduced_featureVec)
    return reduced_data

def choose_best_feature(data):
    # ID3: feature with the largest information gain.
    numFeatures = len(data[0]) - 1
    baseEntropy = calc_entropy(data)
    bestGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in data]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subData = split_dataset(data, i, value)
            prob = len(subData) / float(len(data))
            newEntropy += prob * calc_entropy(subData)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestGain:
            bestGain = infoGain
            bestFeature = i
    return bestFeature

def majorityCnt(classList):
    # Most frequent class label.
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(data, labels):
    classList = [example[-1] for example in data]
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(data[0]) == 1:
        return majorityCnt(classList)
    bestFeat = choose_best_feature(data)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in data]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            split_dataset(data, bestFeat, value), subLabels)
    return myTree

def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel

if __name__ == '__main__':
    dataset = create_dataset()                   # dict view, for inspection only
    decision_tree = createTree(data, labels[:])  # pass a copy: createTree consumes its labels argument
    print(decision_tree)
    testVec = ['皮蛋编程', '男']
    result = classify(decision_tree, labels, testVec)
    print(result)
The output is:

{'姓名': {'皮蛋': '是', '皮蛋编程': {'性别': {'女': '否', '男': '是'}}}}
是

(The order of branches inside each dict may vary between runs, since the branch values come from iterating a set.)
That is, for the test sample with 姓名 = '皮蛋编程' and 性别 = '男', the tree predicts the class '是'.
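As a cross-check, the same toy problem can be run through scikit-learn (a sketch under the assumption that scikit-learn is available; its CART-based tree differs from the hand-rolled ID3 above and needs the categorical columns encoded as numbers first):

from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

X = [row[:-1] for row in data]           # feature columns from the listing above
y = [row[-1] for row in data]            # class labels
enc = OrdinalEncoder()                   # map the string categories to numbers
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(enc.fit_transform(X), y)
print(clf.predict(enc.transform([['皮蛋编程', '男']])))   # expected: ['是']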