机器学习实战

使用AdaBoost来预测患有疝气病的马的存活问题

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
结果示例
在这里插入图片描述完整代码

# -*- coding: utf-8 -*-
# @Time    : 2021/6/21 15:33
# @Author  : weiwei
# @File    : Horse.py
import numpy as np
from math import log


def loadSimpData():
    datMat = np.matrix([
        [1., 2.1],
        [2, 1.1],
        [1.3, 1.],
        [1., 1.],
        [2., 1.]
    ])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return datMat, classLabels


def loadDataSet(fileName):
    numFeat = len(open(fileName).readline().split('\t'))
    dataMat = [];
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat - 1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    retArray = np.ones((np.shape(dataMatrix)[0], 1))
    if threshIneq == 'lt':
        retArray[dataMatrix[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:, dimen] <= threshVal] = 1.0
    return retArray


def buildStump(dataArr, classLabels, D):
    dataMatrix = np.mat(dataArr)
    labelMat = np.mat(classLabels).T
    m, n = np.shape(dataMatrix)
    numSteps = 10.0;
    bestStump = {};
    bestClassEst = np.mat(np.zeros((m, 1)))
    minError = np.inf
    for i in range(n):
        rangeMin = dataMatrix[:, i].min();
        rangeMax = dataMatrix[:, i].max();
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):

            for inequal in ['lt', 'gt']:
                threshVal = (rangeMin + float(j) * stepSize)
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = np.mat(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightError = D.T * errArr
                print("the error rate of this test is %.3f" % (weightError))
                if weightError < minError:
                    minError = weightError
                    bestClassEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEst


def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    weakClassArr = []
    m = np.shape(dataArr)[0]
    D = np.mat(np.ones((m, 1)) / m)
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        print("D:", D.T)

        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print("classEst: ", classEst.T)

        expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        print("aggClassEst: ", aggClassEst.T)
        aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: ", errorRate)
        if errorRate == 0.0: break
    return weakClassArr, aggClassEst


def adaClassify(datToClass, classifierArr):
    dataMatrix = np.mat(datToClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(len(classifierArr)):
        print("the error rate of this test is %.6f" % (classifierArr[i]['alpha']))

        classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'], \
                                 classifierArr[i]['thresh'], \
                                 classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha'] * classEst
    return np.sign(aggClassEst)


if __name__ == '__main__':
    datMat, classLabels = loadSimpData()
    D = np.mat(np.ones((5, 1)) / 5)
    buildStump(datMat, classLabels, D)
    classifierArr, aggClassEst = adaBoostTrainDS(datMat, classLabels)
    print(adaClassify([[0, 0], [1, 0], [2, 2]], classifierArr))

    datArr, labelArr = loadDataSet('horseColicTraining.txt')
    classifierArr, aggClassEst = adaBoostTrainDS(datArr, labelArr, 10)
    testArr, testLableArr = loadDataSet('horseColicTest.txt')
    prediction = adaClassify(testArr, classifierArr)
    errArr = np.mat(np.ones((67, 1)))
    errRate = errArr[prediction != np.mat(testLableArr).T].sum() / 67
    print("after 10 iterations the average error rate is: %f" % (errRate))

数据集相关资料可以从Machine-learning中找到
在这里插入图片描述

Logo

汇聚全球AI编程工具,助力开发者即刻编程。

更多推荐