"""Minimal CART-style decision-tree classifier: question & counting helpers."""
# import streamlit as st
import pandas as pd  # unused in this section; kept — it may be used elsewhere in the notebook


def unique_values(rows, col):
    """Return the set of distinct values found in column `col` of `rows`."""
    return {row[col] for row in rows}


def class_counts(rows):
    """Count occurrences of each class label.

    Convention throughout this module: the label is the LAST element of
    every row.
    """
    counts = {}
    for row in rows:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    return counts


def is_numeric(value):
    """True for int/float values (numeric columns split with >=, others with ==)."""
    return isinstance(value, (int, float))


class Question:
    """A test used to partition a dataset.

    Stores a column index and a reference value; `match` compares an
    example's value in that column against it — `>=` for numeric values,
    `==` for everything else.
    """

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        """Return True when `example` satisfies this question."""
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        return val == self.value

    def __repr__(self):
        condition = ">=" if is_numeric(self.value) else "=="
        # BUG FIX: the original read a module-level `header` that is never
        # defined in this file, so repr() raised NameError. Fall back to the
        # bare column index when no `header` global exists.
        try:
            name = header[self.column]
        except NameError:
            name = f"column {self.column}"
        return f'Is {name} {condition} {str(self.value)} ?'
def partition(rows, question):
    """Split `rows` into (true_rows, false_rows) by `question.match`."""
    true_rows, false_rows = [], []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows


def gini(rows):
    """Gini impurity of the label distribution in `rows` (0.0 == pure node)."""
    counts = class_counts(rows)
    impurity = 1
    n = float(len(rows))  # hoisted loop invariant
    for lbl in counts:
        prob_of_lbl = counts[lbl] / n
        impurity -= prob_of_lbl ** 2
    return impurity


def info_gain(left, right, current_uncertainty):
    """Information gain: parent impurity minus the weighted child impurities."""
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)


def find_best_split(rows):
    """Exhaustively try every (column, value) question; return the best.

    Returns (best_gain, best_question). (0, None) when `rows` is empty or
    when no split improves on the current impurity.
    """
    best_gain = 0
    best_question = None
    # ROBUSTNESS FIX: the original indexed rows[0] unconditionally and
    # raised IndexError on an empty dataset.
    if not rows:
        return best_gain, best_question
    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # last column is the label, not a feature

    for col in range(n_features):
        for val in {row[col] for row in rows}:
            question = Question(col, val)
            true_rows, false_rows = partition(rows, question)
            if not true_rows or not false_rows:
                continue  # the question must actually separate the data
            gain = info_gain(true_rows, false_rows, current_uncertainty)
            # >= keeps the LAST of tied questions — original behavior, preserved.
            if gain >= best_gain:
                best_gain, best_question = gain, question
    return best_gain, best_question


class Leaf:
    """Terminal node: holds the class counts of the rows that reached it."""
    def __init__(self, rows):
        self.predictions = class_counts(rows)


class Decision_Node:
    """Internal node: a question plus the two subtrees it splits into."""
    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch


def build_tree(rows):
    """Recursively grow the tree; stop with a Leaf when no split gains anything."""
    gain, question = find_best_split(rows)
    if gain == 0:
        return Leaf(rows)
    true_rows, false_rows = partition(rows, question)
    return Decision_Node(question,
                         build_tree(true_rows),
                         build_tree(false_rows))


def print_tree(node, spacing=''):
    """Pretty-print the tree, growing `spacing` at each level."""
    if isinstance(node, Leaf):
        print(spacing + 'Predict', node.predictions)
        return
    print(spacing + str(node.question))
    print(spacing + '--> True:')
    print_tree(node.true_branch, spacing + ' ')
    print(spacing + '--> False:')
    print_tree(node.false_branch, spacing + ' ')


def classify(row, node):
    """Walk the tree for `row`; return the reached leaf's class-count dict."""
    if isinstance(node, Leaf):
        return node.predictions
    branch = node.true_branch if node.question.match(row) else node.false_branch
    return classify(row, branch)


def print_leaf(counts):
    """Convert raw leaf counts into a {label: 'NN%'} dict (truncated ints)."""
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = str(int(counts[lbl] / total * 100)) + '%'
    return probs


def predict(data, header):
    """Build a tree from `data` and classify every training row.

    NOTE(review): results are keyed by each row's ACTUAL label, so rows
    sharing a label overwrite one another — only the last prediction per
    label survives. Original behavior, preserved for compatibility.
    `header` is accepted for interface compatibility but unused here
    (column names are only read by Question.__repr__ via a module global);
    the original's no-op `header = header` assignment was removed.
    """
    tree = build_tree(data)
    results = {}
    for row in data:
        results[row[-1]] = print_leaf(classify(row, tree))
    return results, tree

# Example driver (left disabled in the original):
# results, tree = predict(data, header)
# print(results)
# print_tree(tree)