Delete DTree-Classifier.ipynb
This commit is contained in:
parent
e6d1b4c053
commit
437f78836c
|
@ -1,141 +0,0 @@
|
||||||
# import streamlit as st
|
|
||||||
import numbers
from collections import Counter

import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
def unique_values(rows, col):
    """Return the set of distinct values found in column ``col`` of ``rows``.

    Args:
        rows: Iterable of indexable rows.
        col: Index of the column to inspect in every row.

    Returns:
        A ``set`` with one entry per distinct value; empty when ``rows`` is
        empty.
    """
    # Set comprehension avoids building an intermediate list first.
    return {row[col] for row in rows}
|
|
||||||
|
|
||||||
def class_counts(rows):
    """Count how many rows carry each label.

    The label of a row is its last element (``row[-1]``).

    Args:
        rows: Iterable of indexable rows whose last element is the label.

    Returns:
        A plain ``dict`` mapping label -> number of rows with that label,
        with keys in first-seen order. Empty when ``rows`` is empty.
    """
    # Counter does the tallying; convert back to a plain dict so callers that
    # print or compare the result see an ordinary dict, not ``Counter(...)``.
    return dict(Counter(row[-1] for row in rows))
|
|
||||||
|
|
||||||
|
|
||||||
def is_numeric(value):
    """Return True when ``value`` is a real (orderable) number.

    ``int`` and ``float`` are accepted as before (``bool`` included, since it
    is an ``int`` subclass). Any other type registered as ``numbers.Real`` is
    accepted too, so that integer scalars which are not ``int`` subclasses are
    still recognised as numbers. Complex numbers are rejected because they
    cannot be ordered with ``>=``.

    Args:
        value: Any object.

    Returns:
        ``True`` for real numbers, ``False`` otherwise.
    """
    return isinstance(value, numbers.Real)
|
|
||||||
|
|
||||||
|
|
||||||
class Question:
    """A yes/no test on a single column, used to split a dataset.

    A question stores a column index and a reference value. Numeric values
    are compared with ``>=``; everything else is compared with ``==``.
    """

    def __init__(self, column, value):
        """Store the column index and the value rows are compared against."""
        self.column = column
        self.value = value

    def match(self, example):
        """Return True when ``example`` satisfies this question.

        Args:
            example: Indexable row; ``example[self.column]`` is the cell that
                gets tested.

        Returns:
            ``cell >= value`` when both the cell and the question value are
            numeric, otherwise ``cell == value``.
        """
        val = example[self.column]
        # Only order-compare when BOTH sides are numeric: a numeric cell
        # tested against a non-numeric question value (mixed-type column)
        # would otherwise raise TypeError.
        if is_numeric(val) and is_numeric(self.value):
            return val >= self.value
        return val == self.value

    def _column_label(self):
        """Return the display name of this question's column.

        The name comes from a module-level ``header`` sequence when one is
        defined and covers this column; otherwise a generic ``column N``
        label is used so that printing never fails.
        """
        names = globals().get("header")
        try:
            return names[self.column]
        except (TypeError, IndexError, KeyError):
            return f'column {self.column}'

    def __repr__(self):
        """Render the question as human-readable text."""
        condition = ">=" if is_numeric(self.value) else "=="
        return f'Is {self._column_label()} {condition} {str(self.value)} ?'
|
|
||||||
|
|
||||||
|
|
||||||
def partition(rows, question):
    """Split ``rows`` into those that satisfy ``question`` and those that don't.

    Args:
        rows: Iterable of rows.
        question: Object exposing ``match(row)``; called once per row, in
            order.

    Returns:
        A ``(true_rows, false_rows)`` tuple of lists, each preserving the
        original row order.
    """
    matched, unmatched = [], []
    for row in rows:
        bucket = matched if question.match(row) else unmatched
        bucket.append(row)
    return matched, unmatched
|
|
||||||
|
|
||||||
def gini(rows):
    """Compute the Gini impurity of the labels in ``rows``.

    Starts from 1 and subtracts the squared share of every label reported by
    ``class_counts``.

    Args:
        rows: Sized collection of rows accepted by ``class_counts``.

    Returns:
        The impurity value; ``1`` when ``rows`` is empty (no label to
        subtract).
    """
    total = float(len(rows))
    impurity = 1
    # Subtract one label at a time, in the order class_counts reports them.
    for count in class_counts(rows).values():
        share = count / total
        impurity -= share ** 2
    return impurity
|
|
||||||
|
|
||||||
|
|
||||||
def info_gain(left, right, current_uncertainty):
    """Return how much a split reduces impurity.

    The gain is the starting uncertainty minus the size-weighted Gini
    impurity of the two child partitions.

    Args:
        left: Rows in the first partition.
        right: Rows in the second partition.
        current_uncertainty: Impurity of the unsplit rows.

    Returns:
        ``current_uncertainty`` minus the weighted impurities of ``left`` and
        ``right``.
    """
    n_left = len(left)
    p = float(n_left) / (n_left + len(right))
    left_term = p * gini(left)
    right_term = (1 - p) * gini(right)
    return current_uncertainty - left_term - right_term
|
|
||||||
|
|
||||||
def find_best_split(rows):
    """Find the question that yields the highest information gain.

    Every distinct value of every feature column (all columns except the
    last, which holds the label) is tried as a candidate question.

    Args:
        rows: Sized, indexable collection of rows; the last element of each
            row is its label.

    Returns:
        A ``(best_gain, best_question)`` tuple. ``best_gain`` is ``0`` and
        ``best_question`` may be ``None`` when no useful split exists,
        including when ``rows`` is empty.
    """
    best_gain = 0
    best_question = None

    # Nothing to split: avoid indexing rows[0] on an empty dataset.
    if len(rows) == 0:
        return best_gain, best_question

    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # last column is the label

    for col in range(n_features):
        for val in unique_values(rows, col):
            question = Question(col, val)
            true_rows, false_rows = partition(rows, question)

            # A question that leaves one side empty does not split anything.
            if not true_rows or not false_rows:
                continue

            gain = info_gain(true_rows, false_rows, current_uncertainty)
            # ``>=`` is kept on purpose: on ties the later candidate wins.
            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question
|
|
||||||
|
|
||||||
class Leaf:
    """Terminal node of the tree.

    Holds, in ``predictions``, the label tally that ``class_counts`` computes
    for the rows that reached this node.
    """

    def __init__(self, rows):
        """Tally the labels of ``rows`` and keep the result."""
        label_tally = class_counts(rows)
        self.predictions = label_tally
|
|
||||||
|
|
||||||
class Decision_Node:
    """Internal node of the tree.

    Keeps the question asked at this node and the two subtrees to follow
    depending on the answer.
    """

    def __init__(self, question, true_branch, false_branch):
        """Store the question and both child branches unchanged."""
        self.question, self.true_branch, self.false_branch = (
            question,
            true_branch,
            false_branch,
        )
|
|
||||||
|
|
||||||
def build_tree(rows):
    """Recursively build a decision tree for ``rows``.

    Args:
        rows: Rows to fit; passed to ``find_best_split`` and ``partition``.

    Returns:
        A ``Leaf`` when the best split has zero gain, otherwise a
        ``Decision_Node`` whose branches are built from the two partitions.
    """
    best_gain, best_question = find_best_split(rows)

    # No split improves on the current rows: stop here.
    if best_gain == 0:
        return Leaf(rows)

    matched, unmatched = partition(rows, best_question)
    # Arguments are evaluated left to right, so the true branch is built
    # before the false branch.
    return Decision_Node(
        best_question,
        build_tree(matched),
        build_tree(unmatched),
    )
|
|
||||||
|
|
||||||
def print_tree(node, spacing=''):
    """Print the tree rooted at ``node`` to standard output.

    Args:
        node: A ``Leaf`` or an object with ``question``, ``true_branch`` and
            ``false_branch`` attributes.
        spacing: Prefix prepended to every line printed for this node; it
            grows by one space per level.
    """
    if isinstance(node, Leaf):
        print(spacing + 'Predict', node.predictions)
        return

    print(spacing + str(node.question))

    # True branch first, then false branch, each one level deeper.
    branches = (
        ('--> True:', node.true_branch),
        ('--> False:', node.false_branch),
    )
    for caption, subtree in branches:
        print(spacing + caption)
        print_tree(subtree, spacing + ' ')
|
|
||||||
|
|
||||||
|
|
||||||
def classify(row, node):
    """Walk the tree from ``node`` and return the predictions for ``row``.

    Args:
        row: Row to classify; handed to each node's ``question.match``.
        node: Root of the (sub)tree to walk.

    Returns:
        The ``predictions`` attribute of the ``Leaf`` that ``row`` reaches.
    """
    current = node
    # Descend until a leaf is reached, following the branch each answer picks.
    while not isinstance(current, Leaf):
        if current.question.match(row):
            current = current.true_branch
        else:
            current = current.false_branch
    return current.predictions
|
|
||||||
|
|
||||||
|
|
||||||
def print_leaf(counts):
    """Convert label counts into whole-number percentage strings.

    Percentages are truncated toward zero (1 of 3 -> ``'33%'``).

    Args:
        counts: Mapping of label -> count.

    Returns:
        A dict mapping each label to a string such as ``'42%'``; empty when
        ``counts`` is empty.
    """
    total = sum(counts.values())
    # Multiply before dividing and use floor division: the float form
    # ``count / total * 100`` can land just below an exact percentage
    # (29 / 100.0 * 100 == 28.999999999999996) and truncate to the wrong
    # integer.
    return {
        lbl: str(int(count * 100 // total)) + '%'
        for lbl, count in counts.items()
    }
|
|
||||||
|
|
||||||
|
|
||||||
def predict(data, header):
    """Fit a decision tree on ``data`` and classify every training row.

    Args:
        data: Rows to train on; the last element of each row is its label.
        header: Column names, indexed by column position. They are only used
            when a question is rendered as text (e.g. by ``print_tree``).

    Returns:
        A ``(results, tree)`` tuple. ``results`` maps each actual label to the
        formatted prediction of the last row seen with that label, so rows
        sharing a label overwrite one another. ``tree`` is the root node.
    """
    # Question.__repr__ looks column names up in the module-level ``header``.
    # The previous ``header = header`` only rebound the local parameter, so
    # publish the names explicitly for later printing of the tree.
    globals()["header"] = header

    tree = build_tree(data)
    # NOTE(review): keying by the actual label keeps only one entry per label;
    # kept as-is because callers may rely on this return shape.
    results = {row[-1]: print_leaf(classify(row, tree)) for row in data}
    return results, tree
|
|
||||||
|
|
||||||
# results, tree = predict(data, header)
|
|
||||||
# print(results)
|
|
||||||
# print_tree(tree)
|
|
Loading…
Reference in New Issue