Steps to Implement a Decision Tree Algorithm in Python

2023-04-14 · Algorithms · Steps · Decision Trees

The steps for implementing a decision tree algorithm in Python are as follows:

Step 1: Data preprocessing

Before running the decision tree algorithm, the data needs to be preprocessed; this includes cleaning the data, handling missing values, selecting features, and so on.
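
As one minimal illustration (the fill_missing helper below is hypothetical and is not used by the example later in this article), missing categorical values can be replaced with the most frequent value in their column:

from collections import Counter

def fill_missing(rows, col, missing='?'):
    # Fill placeholder values in one column with that column's most frequent value.
    observed = [row[col] for row in rows if row[col] != missing]
    mode = Counter(observed).most_common(1)[0][0]
    filled = []
    for row in rows:
        row = row[:]               # copy so the input is left untouched
        if row[col] == missing:
            row[col] = mode
        filled.append(row)
    return filled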

Step 2: Choosing features

Choosing suitable features to split on is the core of the decision tree algorithm. Common selection criteria include information gain, information gain ratio, and the Gini index.
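
The walkthrough later in this article uses information gain (ID3). For comparison, here is a minimal sketch of the Gini criterion, assuming the same row format used below (the last element of each row is the class label):

from collections import Counter

def gini(rows):
    # Gini impurity: 1 minus the sum of squared class probabilities.
    counts = Counter(row[-1] for row in rows)
    total = len(rows)
    return 1.0 - sum((n / total) ** 2 for n in counts.values())

For the four sample rows used later (three '是', one '否'), this gives 1 - (0.75² + 0.25²) = 0.375.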

Step 3: Building the decision tree

Once suitable features have been chosen, we can build the tree. Building it is a recursive process: pick the best feature at the current node, split the data on its values, and repeat on each subset until the labels are pure or the features run out.

Step 4: Pruning

After the tree has been built, it needs to be pruned to avoid overfitting.
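
Pruning is not implemented in the walkthrough below; the sketch here is only a hypothetical illustration, assuming the nested-dict tree format built in Step 3. It collapses splits whose branches all predict the same class; real post-pruning (for example reduced-error pruning) would instead compare validation-set error before and after collapsing a node.

def prune_uniform(tree):
    # A leaf is just a class label; only dict nodes can be collapsed.
    if not isinstance(tree, dict):
        return tree
    feature = next(iter(tree))
    branches = {value: prune_uniform(subtree)
                for value, subtree in tree[feature].items()}
    leaves_only = all(not isinstance(b, dict) for b in branches.values())
    if leaves_only and len(set(branches.values())) == 1:
        # Every branch predicts the same class, so the split is redundant.
        return next(iter(branches.values()))
    return {feature: branches}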

Step 5: Prediction

Once the decision tree has been built, it can be used to make predictions on new data.

Code walkthrough: implementing the decision tree algorithm in Python.

Below we use a small "皮蛋编程" dataset as a running example to demonstrate how to implement the algorithm in Python.

First, prepare the data:

# Each row is [姓名, 性别, label]; the label '是'/'否' ("yes"/"no") is the class.
data = [['皮蛋', '男', '是'],
        ['皮蛋', '女', '是'],
        ['皮蛋编程', '男', '是'],
        ['皮蛋编程', '女', '否']]
labels = ['姓名', '性别']  # feature names: name, gender

Next we wrap the data in a small helper. The functions that follow index each row positionally and treat the last element as the class label, so create_dataset simply hands out working copies of the rows:

def create_dataset():
    # Return copies of the rows so later splits cannot modify the original data.
    # Each row is a list whose last element is the class label.
    return [row[:] for row in data]

Then we need to pick the best feature for each split; here we use the ID3 algorithm, which chooses the feature with the largest information gain.

import math

def calc_entropy(data):
    # Shannon entropy of the class labels (the last element of each row).
    labelCounts = {}
    numEntries = len(data)
    for featureVec in data:
        currentLabel = featureVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    entropy = 0.0
    for key in labelCounts:
        prob = labelCounts[key] / numEntries
        entropy -= prob * math.log(prob, 2)
    return entropy
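
As a quick sanity check, the sample data has three '是' rows and one '否' row:

print(calc_entropy(data))  # -(3/4)*log2(3/4) - (1/4)*log2(1/4) ≈ 0.811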

def split_dataset(data, axis, value):
    # Keep the rows where column `axis` equals `value`, with that column removed.
    reduced_data = []
    for featureVec in data:
        if featureVec[axis] == value:
            reduced_featureVec = featureVec[:axis]
            reduced_featureVec.extend(featureVec[axis+1:])
            reduced_data.append(reduced_featureVec)
    return reduced_data
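
For example, keeping the rows whose 姓名 column (index 0) equals '皮蛋编程':

print(split_dataset(data, 0, '皮蛋编程'))  # [['男', '是'], ['女', '否']]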

def choose_best_feature(data):
    # Return the index of the feature with the highest information gain (ID3).
    numFeatures = len(data[0]) - 1     # the last column is the class label
    baseEntropy = calc_entropy(data)
    bestGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in data]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subData = split_dataset(data, i, value)
            prob = len(subData) / len(data)
            newEntropy += prob * calc_entropy(subData)
        infoGain = baseEntropy - newEntropy   # entropy reduction from splitting on i
        if infoGain > bestGain:
            bestGain = infoGain
            bestFeature = i
    return bestFeature
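
On the sample data, splitting on either feature leaves a conditional entropy of 0.5, so both information gains equal 0.811 - 0.5 ≈ 0.311. The comparison only replaces the best feature on a strict improvement, so ties go to the earlier feature:

print(choose_best_feature(data))  # 0 (姓名)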

Next we build the tree recursively, using nested dicts to represent it.
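
The builder also needs a small helper: when every feature has been consumed but the remaining rows still carry mixed labels, majorityCnt returns the most common class (this function also appears in the complete listing at the end):

import operator

def majorityCnt(classList):
    # Count each class label and return the most frequent one.
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

With that helper in place, the recursive builder: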

def createTree(data, labels):
    classList = [example[-1] for example in data]
    # Base case 1: every remaining row has the same class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Base case 2: no features left, only the class column remains.
    if len(data[0]) == 1:
        return majorityCnt(classList)
    bestFeat = choose_best_feature(data)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]       # this feature is consumed at this node
    featValues = [example[bestFeat] for example in data]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so sibling branches see the same labels
        myTree[bestFeatLabel][value] = createTree(split_dataset(data, bestFeat, value), subLabels)
    return myTree

Finally, we use the finished tree to classify new samples:

def classify(inputTree, featLabels, testVec):
    # Walk the tree from the root until a leaf (a class label) is reached.
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None   # stays None if the test value was never seen in training
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel
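
For example, building a tree from the sample data and classifying the male '皮蛋编程' sample (note that createTree consumes entries of the label list, so we pass it a copy):

tree = createTree(create_dataset(), labels[:])
print(classify(tree, labels, ['皮蛋编程', '男']))  # 是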

The complete program:

import math
import operator

# Each row is [姓名, 性别, label]; the label '是'/'否' ("yes"/"no") is the class.
data = [['皮蛋', '男', '是'],
        ['皮蛋', '女', '是'],
        ['皮蛋编程', '男', '是'],
        ['皮蛋编程', '女', '否']]
labels = ['姓名', '性别']  # feature names: name, gender

def create_dataset():
    # Return copies of the rows so later splits cannot modify the original data.
    # Each row is a list whose last element is the class label.
    return [row[:] for row in data]

def calc_entropy(data):
    # Shannon entropy of the class labels (the last element of each row).
    labelCounts = {}
    numEntries = len(data)
    for featureVec in data:
        currentLabel = featureVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    entropy = 0.0
    for key in labelCounts:
        prob = labelCounts[key] / numEntries
        entropy -= prob * math.log(prob, 2)
    return entropy

def split_dataset(data, axis, value):
    # Keep the rows where column `axis` equals `value`, with that column removed.
    reduced_data = []
    for featureVec in data:
        if featureVec[axis] == value:
            reduced_featureVec = featureVec[:axis]
            reduced_featureVec.extend(featureVec[axis+1:])
            reduced_data.append(reduced_featureVec)
    return reduced_data

def choose_best_feature(data):
    # Return the index of the feature with the highest information gain (ID3).
    numFeatures = len(data[0]) - 1     # the last column is the class label
    baseEntropy = calc_entropy(data)
    bestGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in data]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subData = split_dataset(data, i, value)
            prob = len(subData) / len(data)
            newEntropy += prob * calc_entropy(subData)
        infoGain = baseEntropy - newEntropy   # entropy reduction from splitting on i
        if infoGain > bestGain:
            bestGain = infoGain
            bestFeature = i
    return bestFeature

def majorityCnt(classList):
    # Count each class label and return the most frequent one.
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(data, labels):
    classList = [example[-1] for example in data]
    # Base case 1: every remaining row has the same class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Base case 2: no features left, only the class column remains.
    if len(data[0]) == 1:
        return majorityCnt(classList)
    bestFeat = choose_best_feature(data)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]       # this feature is consumed at this node
    featValues = [example[bestFeat] for example in data]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so sibling branches see the same labels
        myTree[bestFeatLabel][value] = createTree(split_dataset(data, bestFeat, value), subLabels)
    return myTree

def classify(inputTree, featLabels, testVec):
    # Walk the tree from the root until a leaf (a class label) is reached.
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None   # stays None if the test value was never seen in training
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel

if __name__ == '__main__':
    dataset = create_dataset()
    feat_labels = ['姓名', '性别']
    # createTree consumes entries of the label list it is given, so pass a
    # copy and keep feat_labels intact for classify().
    decision_tree = createTree(dataset, feat_labels[:])
    print(decision_tree)
    testVec = ['皮蛋编程', '男']
    result = classify(decision_tree, feat_labels, testVec)
    print(result)

The output is:

{'姓名': {'皮蛋': '是', '皮蛋编程': {'性别': {'男': '是', '女': '否'}}}}
是

The tree splits first on 姓名; the '皮蛋' branch is already pure ('是'), while the '皮蛋编程' branch splits again on 性别. The test vector ['皮蛋编程', '男'] therefore lands on the '是' leaf. (Branch order inside the printed dicts may vary between runs, since branch values are iterated from a set.)
