Import the required Python libraries:
import numpy as np
import pandas as pd
from numpy import log2 as log

eps = np.finfo(float).eps
Here 'eps' is the machine epsilon, the smallest representable increment for floats; we add it later to avoid dividing by zero and taking log(0).
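As a quick sketch of why eps matters: log2(0) is undefined (NumPy warns and returns -inf), while log2(0 + eps) stays finite:

print(eps)           # 2.220446049250313e-16 for 64-bit floats
print(np.log2(eps))  # about -52: very negative, but finite
# np.log2(0) would instead emit a RuntimeWarning and return -inf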
Create the machine learning dataset:
outlook = 'overcast,overcast,overcast,overcast,rainy,rainy,rainy,rainy,rainy,sunny,sunny,sunny,sunny,sunny'.split(',')
temp = 'hot,cool,mild,hot,mild,cool,cool,mild,mild,hot,hot,mild,cool,mild'.split(',')
humidity = 'high,normal,high,normal,high,normal,normal,normal,high,high,high,high,normal,normal'.split(',')
windy = 'FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,TRUE'.split(',')
play = 'yes,yes,yes,yes,yes,yes,no,yes,no,no,no,no,yes,yes'.split(',')
Create the pandas DataFrame:
dataset = {'outlook': outlook, 'temp': temp, 'humidity': humidity, 'windy': windy, 'play': play}
df = pd.DataFrame(dataset, columns=['outlook', 'temp', 'humidity', 'windy', 'play'])
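A quick sanity check on the assembled frame (expected output shown as comments):

print(df.shape)  # (14, 5)
df.head()        # first rows: overcast/hot/high/FALSE/yes, ...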
Now let's walk through the steps for building the decision tree:
- Compute the entropy of the dataset
- For each attribute/feature:
  - compute the entropy of each of its categorical values
  - compute the weighted average information entropy of the attribute
  - compute the information gain of the attribute
- Select the attribute with the highest gain.
- Repeat this process until we get the tree we want (see the sketch after this list).
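Tying the steps to code, a rough pseudocode sketch of one level of the tree (the helper names here are placeholders; the real implementations follow later in this post):

# e_dataset = entropy(df)                         # step 1
# for each attribute a:
#     gain[a] = e_dataset - entropy_attr(df, a)   # per-attribute entropy and gain
# winner = argmax(gain)                           # highest-gain attribute
# recurse on every non-pure subset df[df[winner] == value]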
1. Find the entropy and the information gain of each candidate split of the dataset.
We will define a function that takes the class (the target-variable vector) and computes its entropy, E(S) = -Σᵢ pᵢ·log₂(pᵢ).
The fraction pᵢ here is the proportion of elements in a split group relative to the number of elements in the group before the split (the parent group).
# 1. Calculate the entropy of the whole dataset
entropy_node = 0            # initialize entropy
values = df.play.unique()   # unique class labels - 'yes', 'no'
for value in values:
    fraction = df.play.value_counts()[value]/len(df.play)
    entropy_node += -fraction*np.log2(fraction)
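With 9 'yes' and 5 'no' labels, -(9/14)·log₂(9/14) - (5/14)·log₂(5/14) ≈ 0.9403, so printing the node entropy should give approximately:

print(entropy_node)  # ≈ 0.9403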
2. Now define a function ent to compute the entropy of each attribute:
def ent(df, attribute):
    target_variables = df.play.unique()   # all class labels ('yes' and 'no')
    variables = df[attribute].unique()    # distinct values of this attribute (e.g. 'hot', 'mild', 'cool')
    entropy_attribute = 0
    for variable in variables:
        entropy_each_feature = 0
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute]==variable][df.play==target_variable])  # numerator
            den = len(df[attribute][df[attribute]==variable])                            # denominator
            fraction = num/(den+eps)                              # pi
            entropy_each_feature += -fraction*log(fraction+eps)   # entropy of one attribute value
        fraction2 = den/len(df)
        entropy_attribute += -fraction2*entropy_each_feature      # weighted sum over all values
    return abs(entropy_attribute)   # abs() undoes the sign flip introduced above
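As a usage check, calling ent on the outlook column should return roughly 0.694, the weighted entropy of its three branches (overcast is pure; sunny and rainy each carry entropy ≈ 0.971):

print(ent(df, 'outlook'))  # ≈ 0.694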
Store the entropy of each attribute under its name:
a_entropy = {k: ent(df, k) for k in df.keys()[:-1]}
a_entropy
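For this dataset, the dictionary should come out approximately as shown in the comment:

print({k: round(v, 3) for k, v in a_entropy.items()})
# {'outlook': 0.694, 'temp': 0.911, 'humidity': 0.788, 'windy': 0.892}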
3. Calculate the information gain for each attribute:
Define a function to compute IG (information gain):
IG(attr) = entropy of the dataset - entropy of the attribute
def ig(e_dataset, e_attr):
    return e_dataset - e_attr
Store the IG of each attribute in a dictionary:
# entropy_node = entropy of the dataset
# a_entropy[k] = entropy of the k-th attribute
IG = {k: ig(entropy_node, a_entropy[k]) for k in a_entropy}
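Printing the rounded gains makes the comparison explicit; the values should come out approximately:

print({k: round(v, 3) for k, v in IG.items()})
# {'outlook': 0.247, 'temp': 0.029, 'humidity': 0.152, 'windy': 0.048}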
We can see that outlook has the highest information gain, about 0.247, so we select outlook as the node to split on at this level.
To continue growing the tree we use recursion: repeat the same procedure on each subtree until the whole tree is complete.
We build the decision tree on this idea. Here is the Python code.
def find_entropy(df):
    Class = df.keys()[-1]   # target variable is the last column, keeping the code generic
    entropy = 0
    values = df[Class].unique()
    for value in values:
        fraction = df[Class].value_counts()[value]/len(df[Class])
        entropy += -fraction*np.log2(fraction)
    return entropy

def find_entropy_attribute(df, attribute):
    Class = df.keys()[-1]   # target variable is the last column, keeping the code generic
    target_variables = df[Class].unique()   # all class labels ('yes' and 'no')
    variables = df[attribute].unique()      # distinct values of this attribute (e.g. 'hot', 'cool' in temp)
    entropy2 = 0
    for variable in variables:
        entropy = 0
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute]==variable][df[Class]==target_variable])
            den = len(df[attribute][df[attribute]==variable])
            fraction = num/(den+eps)
            entropy += -fraction*log(fraction+eps)
        fraction2 = den/len(df)
        entropy2 += -fraction2*entropy   # weighted sum; abs() below undoes the sign flip
    return abs(entropy2)

def find_winner(df):
    IG = []
    for key in df.keys()[:-1]:
        IG.append(find_entropy(df) - find_entropy_attribute(df, key))
    return df.keys()[:-1][np.argmax(IG)]   # attribute with the highest information gain

def get_subtable(df, node, value):
    return df[df[node] == value].reset_index(drop=True)

def buildTree(df, tree=None):
    Class = df.keys()[-1]   # target variable is the last column, keeping the code generic

    # Get the attribute with the maximum information gain
    node = find_winner(df)

    # Get the distinct values of that attribute (e.g. 'sunny', 'overcast', 'rainy' for outlook)
    attValue = np.unique(df[node])

    # Create an empty dictionary to hold the tree
    if tree is None:
        tree = {}
        tree[node] = {}

    # Construct the tree by calling this function recursively;
    # a branch stops growing once its subset is pure.
    for value in attValue:
        subtable = get_subtable(df, node, value)
        clValue, counts = np.unique(subtable[Class], return_counts=True)

        if len(counts) == 1:   # checking purity of the subset
            tree[node][value] = clValue[0]
        else:
            tree[node][value] = buildTree(subtable)   # recursive call on the impure subset

    return tree
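To try the builder end to end, a minimal usage sketch (pprint from the standard library is used only for readable output):

import pprint

tree = buildTree(df)
pprint.pprint(tree)

# Expected structure for this dataset: the overcast branch is pure,
# while rainy and sunny split again on windy and humidity:
# {'outlook': {'overcast': 'yes',
#              'rainy': {'windy': {'FALSE': 'yes', 'TRUE': 'no'}},
#              'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}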