Commit a5c0a5a8 authored by Uwe Köckemann's avatar Uwe Köckemann
Browse files

replaced didi-x with didi, cleaner mt factory, notebooks extended

parent e135452d
......@@ -8,10 +8,8 @@
},
"outputs": [],
"source": [
"from abc import ABC, abstractmethod\n",
"\n",
"import math\n",
"import random as r\n",
"import os\n",
"import sys\n",
"\n",
"import matplotlib.pyplot as plt\n",
"plt.style.use('seaborn-whitegrid')\n",
......@@ -19,24 +17,13 @@
"\n",
"from aiddl_core.representation.symbolic import Symbolic as Sym\n",
"from aiddl_core.representation.variable import Variable as Var\n",
"from aiddl_core.representation.integer import Integer\n",
"from aiddl_core.representation.real import Real\n",
"from aiddl_core.representation.infinity import Infinity\n",
"from aiddl_core.representation.tuple import Tuple\n",
"from aiddl_core.representation.list import List\n",
"from aiddl_core.representation.substitution import Substitution\n",
"from aiddl_core.container.container import Container\n",
"from aiddl_core.tools.combo_iterator import ComboIterator\n",
"from aiddl_core.function.default import get_default_function_registry\n",
"from aiddl_core.tools.logger import Logger\n",
"from aiddl_core.function.default import get_default_function_registry\n",
"from aiddl_core.parser.parser import parse_term as parse\n",
"\n",
"from aiddl_network.grpc_function import GrpcFunction\n",
"from aiddl_network.aiddl_grpc_server import AiddlServicer\n",
"from aiddl_network.aiddl_grpc_server import LOADER_URI\n",
"\n",
"import os\n",
"import sys\n",
"module_path = os.path.abspath(os.path.join('../python/moving_target'))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
......@@ -46,8 +33,7 @@
"from moving_target_ortools import MovingTargetRegOT\n",
"from moving_target_ortools import MovingTargetClsOT\n",
"\n",
"from factory import get_problem\n",
"from preprocessing import OneHotEncoder\n",
"from factory import moving_targets_factory\n",
"from tools import CsvLoader\n",
"from utils import didi_r\n",
"from scikit_learn_wrapper import SciKitLearnFunction\n",
......@@ -125,7 +111,7 @@
"source": [
"# Without Moving Targets\n",
"\n",
"Fairness and scores without moving targets."
"Fairness and score after a single learning step (without moving targets). Note that the DIDI calculation protects the last non-label attribute in our data set."
]
},
{
......@@ -140,8 +126,8 @@
"output_type": "stream",
"text": [
"DIDI (pre-learning): 0.18046885714845506\n",
"DIDI (post-learning): 0.22806466902039563\n",
"MSE (test data): 0.22806466902039563\n"
"DIDI (post-learning): 0.2278699416515176\n",
"MSE (test data): 0.007579562360896998\n"
]
}
],
......@@ -159,14 +145,14 @@
"\n",
"f_ML = SciKitLearnFunction(learner)\n",
"F.add_function(Sym(\"mt.learner\"), f_ML)\n",
"f_ML.apply(train_data)\n",
"f_ML(train_data)\n",
"y_0 = f_ML.predict(x_test)\n",
"\n",
"didi_0 = didi_r(np.array(x_test), np.array(y_0), [len(x[0])-1])\n",
"mse_0 = mean_squared_error(y_0, y_test)\n",
"\n",
"print(\"DIDI (post-learning):\", didi_0)\n",
"print(\"MSE (test data):\", didi_0)\n"
"print(\"MSE (test data):\", mse_0)\n"
]
},
{
......@@ -175,22 +161,33 @@
"source": [
"# Moving Targets\n",
"\n",
"## Parameters"
"## Parameters\n",
"\n",
"Below we set the following parameters to configure moving targets:\n",
"\n",
"- **n**: Maximum number of iterations\n",
"- **alpha**: Parameter alpha\n",
"- **beta**: Parameter beta\n",
"- **problem-type**: *classification* or *regression*\n",
"- **constraint-solver**: *cplex*, *ortools*, or *pysmt*\n",
"- **learner**: Symbolic name of machine learning function\n",
"\n",
"The corresponding AIDDL term is a tuple *(k1:v1 k2:v2)* of key-value pairs *key:value*. Here we parse the information from a string, but it can also be loaded from an entry in an AIDDL file."
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"cfg = parse('''(\n",
" n:10 ;; Number of iterations\n",
" alpha:1.0 ;; Parameter alpha\n",
" beta:1.0 ;; Parameter beta\n",
" problem-type:regression ;; classification/regression?\n",
" constraint-solver:ortools ;; Select constraint solver (cplex/pysmt/ortools)\n",
" learner:mt.learner ;; URI of learner (registered above)\n",
" n:10 \n",
" alpha:1.0 \n",
" beta:1.0 \n",
" problem-type:regression \n",
" constraint-solver:ortools \n",
" learner:mt.learner\n",
")''')"
]
},
......@@ -198,22 +195,32 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Input Constraints"
"## Loss Function, Constraints and Data\n",
"\n",
"- **loss-function**: \n",
" - **regression**: *MeanAbsoluteError* or *MeanSquaredError*\n",
" - **classification**: *HammingDistance*\n",
"- **constraints**: AIDDL set *{c1 ...}* of constraints\n",
" - **(< y c)**: attribute *y* should be less than constant *c* (same for <=, >, >=)\n",
" - **(balance y q)**: attribute *y* balanced with unbalance at most *q*\n",
" - **(didi y [x1 x2] q)**: model for *y* should be fair wrt. *x1* and *x2* with unfairness at most *q* \n",
"\n",
"The last line below adds the converted data set with fields *data*, *label*, and *attributes*. As a result *mt_data* will contain all information required by machine learning, constraint solver, and moving targets in a single set. Note that despite the additional information, *mt_data* could still be used directly as input to a machine learning algorithm. "
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"data = parse('''{\n",
" loss-function:MeanAbsoluteError ;; Select loss function\n",
"mt_data = parse('''{\n",
" loss-function:MeanAbsoluteError\n",
" constraints:{\n",
" (didi-real \"violentPerPop\" [\"race\"] 0.01)\n",
" (didi \"violentPerPop\" [\"race\"] 0.01)\n",
" }\n",
"}''')\n",
"data = data.put_all(train_data)"
"mt_data = mt_data.put_all(train_data)"
]
},
{
......@@ -222,16 +229,16 @@
"source": [
"## Create Instance of Moving Targets\n",
"\n",
"Factory method is used to create a version of moving targets for the specified problem type and with the selected constraint solver."
"Factory method is used to create a version of moving targets for the specified problem type and with the selected constraint solver. We also provide a function registry *F* that allows us to look up the learning function based on the symbolic name (*mt.learner* above), as well as the test data that will be used on each iteration of moving targets to track performance on non-training data."
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"f_MT = get_problem(str(cfg.get(Sym(\"problem-type\"))), solver=str(cfg.get(Sym(\"constraint-solver\"))))(cfg, F, test_data=x_test)"
"f_MT = moving_targets_factory(cfg, F, test_data=x_test)"
]
},
{
......@@ -243,7 +250,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 10,
"metadata": {},
"outputs": [
{
......@@ -251,8 +258,8 @@
"output_type": "stream",
"text": [
"Initializing model\n",
"('didi-real', 'violentPerPop', [15], 0.01)\n",
"Constraint added: ('didi-real', 'violentPerPop', [15], 0.01)\n",
"('didi', 'violentPerPop', [15], 0.01)\n",
"Constraint added: ('didi', 'violentPerPop', [15], 0.01)\n",
"Constraint satisfaction: False\n",
"Solving\n",
"Constraint satisfaction: False\n",
......@@ -277,39 +284,23 @@
}
],
"source": [
"y_k = f_MT.apply(data)"
"y_k = f_MT(mt_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Print Results"
"## Test Results\n",
"\n",
"Below we print the MSE and DIDI on the test data for each iteration. We would expect fairness to increase (i.e., lower DIDI value) while the MSE increases as a trade-off."
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"n MSE DIDI\n",
"0 0.0076 0.2282\n",
"1 0.0090 0.0408\n",
"2 0.0093 0.0275\n",
"3 0.0095 0.0270\n",
"4 0.0098 0.0381\n",
"5 0.0096 0.0405\n",
"6 0.0097 0.0425\n",
"7 0.0100 0.0398\n",
"8 0.0277 0.0000\n",
"9 0.0277 0.0000\n"
]
}
],
"outputs": [],
"source": [
"i = 0\n",
"\n",
......
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"import matplotlib.pyplot as plt\n",
"plt.style.use('seaborn-whitegrid')\n",
"import numpy as np\n",
"\n",
"from aiddl_core.representation.symbolic import Symbolic as Sym\n",
"from aiddl_core.representation.variable import Variable as Var\n",
"from aiddl_core.representation.substitution import Substitution\n",
"from aiddl_core.container.container import Container\n",
"from aiddl_core.tools.combo_iterator import ComboIterator\n",
"from aiddl_core.tools.logger import Logger\n",
"from aiddl_core.function.default import get_default_function_registry\n",
"from aiddl_core.parser.parser import parse_term as parse\n",
"\n",
"module_path = os.path.abspath(os.path.join('../python/moving_target'))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
"from moving_target_cplex import MovingTargetRegCplex\n",
"from moving_target_cplex import MovingTargetClsCplex\n",
"from moving_target_ortools import MovingTargetRegOT\n",
"from moving_target_ortools import MovingTargetClsOT\n",
"\n",
"from factory import moving_targets_factory\n",
"from tools import CsvLoader\n",
"from utils import didi_c\n",
"from scikit_learn_wrapper import SciKitLearnFunction\n",
"from scikit_learn_wrapper import splitAiddlMlProblem\n",
"\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import mean_squared_error"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Data\n",
"\n",
"Create container and function registry and load some example data from a local file."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Label: \"income\"\n",
"Attributes: [\n",
" (\"age\" org.aiddl.term)\n",
" (\"fnlwgt\" org.aiddl.term)\n",
" (\"education-num\" org.aiddl.term)\n",
" (\"capital-gain\" org.aiddl.term)\n",
" (\"capital-loss\" org.aiddl.term)\n",
" (\"hours-per-week\" org.aiddl.term)\n",
" (\"workclass_Federal-gov\" org.aiddl.term)\n",
" (\"workclass_Local-gov\" org.aiddl.term)\n",
" (\"workclass_Private\" org.aiddl.term)\n",
" (\"workclass_Self-emp-inc\" org.aiddl.term)\n",
" (\"workclass_Self-emp-not-inc\" org.aiddl.term)\n",
" (\"workclass_State-gov\" org.aiddl.term)\n",
" (\"workclass_Without-pay\" org.aiddl.term)\n",
" (\"marital-status_Divorced\" org.aiddl.term)\n",
" (\"marital-status_Married-AF-spouse\" org.aiddl.term)\n",
" (\"marital-status_Married-civ-spouse\" org.aiddl.term)\n",
" (\"marital-status_Married-spouse-absent\" org.aiddl.term)\n",
" (\"marital-status_Never-married\" org.aiddl.term)\n",
" (\"marital-status_Separated\" org.aiddl.term)\n",
" (\"marital-status_Widowed\" org.aiddl.term)\n",
" (\"occupation_Adm-clerical\" org.aiddl.term)\n",
" (\"occupation_Armed-Forces\" org.aiddl.term)\n",
" (\"occupation_Craft-repair\" org.aiddl.term)\n",
" (\"occupation_Exec-managerial\" org.aiddl.term)\n",
" (\"occupation_Farming-fishing\" org.aiddl.term)\n",
" (\"occupation_Handlers-cleaners\" org.aiddl.term)\n",
" (\"occupation_Machine-op-inspct\" org.aiddl.term)\n",
" (\"occupation_Other-service\" org.aiddl.term)\n",
" (\"occupation_Priv-house-serv\" org.aiddl.term)\n",
" (\"occupation_Prof-specialty\" org.aiddl.term)\n",
" (\"occupation_Protective-serv\" org.aiddl.term)\n",
" (\"occupation_Sales\" org.aiddl.term)\n",
" (\"occupation_Tech-support\" org.aiddl.term)\n",
" (\"occupation_Transport-moving\" org.aiddl.term)\n",
" (\"relationship_Husband\" org.aiddl.term)\n",
" (\"relationship_Not-in-family\" org.aiddl.term)\n",
" (\"relationship_Other-relative\" org.aiddl.term)\n",
" (\"relationship_Own-child\" org.aiddl.term)\n",
" (\"relationship_Unmarried\" org.aiddl.term)\n",
" (\"relationship_Wife\" org.aiddl.term)\n",
" (\"sex_Female\" org.aiddl.term)\n",
" (\"sex_Male\" org.aiddl.term)\n",
" (\"race_Amer-Indian-Eskimo\" org.aiddl.term)\n",
" (\"race_Asian-Pac-Islander\" org.aiddl.term)\n",
" (\"race_Black\" org.aiddl.term)\n",
" (\"race_Other\" org.aiddl.term)\n",
" (\"race_White\" org.aiddl.term)\n",
" (\"income\" org.aiddl.term)\n",
" ]\n",
"\n",
"Data (first row): [0.4931506849315069 0.104543595947006 0.26666666666666666 0.0 0.0 0.7040816326530611 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0]\n"
]
}
],
"source": [
"C = Container()\n",
"F = get_default_function_registry(C)\n",
"\n",
"loader = CsvLoader()\n",
"\n",
"test_data = loader.apply(parse('(\"../resources/adult_test.csv\" \",\" \"income\")'))\n",
"train_data = loader.apply(parse('(\"../resources/adult_train.csv\" \",\" \"income\")'))\n",
"\n",
"x, y = splitAiddlMlProblem(train_data)\n",
"x_test, y_test = splitAiddlMlProblem(test_data)\n",
"\n",
"print(\"Label:\", train_data[Sym(\"label\")])\n",
"print(\"Attributes:\", Logger.pretty_print(train_data[Sym(\"attributes\")], 1))\n",
"print(\"Data (first row):\", train_data[Sym(\"data\")][0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Without Moving Targets\n",
"\n",
"Fairness and score after a single learning step (without moving targets)."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DIDI (pre-learning): 1.3178840744255598\n",
"DIDI (post-learning): 1.23095753493759\n",
"ACC (test data): 0.8536383225592574\n"
]
}
],
"source": [
"didi_init = didi_c(np.array(x), np.array(y), [45, 44, 43, 42, 41])\n",
"\n",
"print(\"DIDI (pre-learning):\", didi_init)\n",
"\n",
"learner = parse('''{\n",
" py_module:sklearn.ensemble\n",
" py_class:RandomForestClassifier\n",
" n_estimators:50\n",
" max_depth:5\n",
"}''')\n",
"\n",
"f_ML = SciKitLearnFunction(learner)\n",
"F.add_function(Sym(\"mt.learner\"), f_ML)\n",
"f_ML(train_data)\n",
"y_0 = f_ML.predict(x_test)\n",
"\n",
"didi_0 = didi_c(np.array(x_test), np.array(y_0), [45, 44, 43, 42, 41])\n",
"acc_0 = accuracy_score(y_0, y_test)\n",
"\n",
"print(\"DIDI (post-learning):\", didi_0)\n",
"print(\"ACC (test data):\", acc_0)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Moving Targets\n",
"\n",
"## Parameters\n",
"\n",
"Below we set the following parameters to configure moving targets:\n",
"\n",
"- **n**: Maximum number of iterations\n",
"- **alpha**: Parameter alpha\n",
"- **beta**: Parameter beta\n",
"- **problem-type**: *classification* or *regression*\n",
"- **constraint-solver**: *cplex*, *ortools*, or *pysmt*\n",
"- **learner**: Symbolic name of machine learning function\n",
"\n",
"The corresponding AIDDL term is a tuple *(k1:v1 k2:v2)* of key-value pairs *key:value*. Here we parse the information from a string, but it can also be loaded from an entry in an AIDDL file."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"cfg = parse('''(\n",
" n:10 \n",
" alpha:1.0 \n",
" beta:1.0 \n",
" problem-type:classification \n",
" constraint-solver:ortools \n",
" learner:mt.learner\n",
")''')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loss Function, Constraints and Data\n",
"\n",
"- **loss-function**: \n",
" - **regression**: *MeanAbsoluteError* or *MeanSquaredError*\n",
" - **classification**: *HammingDistance*\n",
"- **constraints**: AIDDL set *{c1 ...}* of constraints\n",
" - **(< y c)**: attribute *y* should be less than constant *c* (same for <=, >, >=)\n",
" - **(balance y q)**: attribute *y* balanced with unbalance at most *q*\n",
" - **(didi y [x1 x2] q)**: model for *y* should be fair wrt. *x1* and *x2* with unfairness at most *q* \n",
"\n",
"The last line below adds the converted data set with fields *data*, *label*, and *attributes*. As a result *mt_data* will contain all information required by machine learning, constraint solver, and moving targets in a single set. Note that despite the additional information, *mt_data* could still be used directly as input to a machine learning algorithm. "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"mt_data = parse('''{\n",
" loss-function:HammingDistance\n",
" constraints:{\n",
" (didi \"income\" [\"race_Amer-Indian-Eskimo\" \"race_Asian-Pac-Islander\" \"race_Black\" \"race_Other\" \"race_White\"] 0.01)\n",
" }\n",
"}''')\n",
"mt_data = mt_data.put_all(train_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Instance of Moving Targets\n",
"\n",
"Factory method is used to create a version of moving targets for the specified problem type and with the selected constraint solver. We also provide a function registry *F* that allows us to look up the learning function based on the symbolic name (*mt.learner* above), as well as the test data that will be used on each iteration of moving targets to track performance on non-training data."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"f_MT = moving_targets_factory(cfg, F, test_data=x_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Apply Moving Targets to Input"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"False\n",
"Constraint added: FairnessConstraint: didi <= 0.01\n",
"Constraint satisfaction: False\n",
"Solving\n",
"Constraint satisfaction: True\n",
"Solving\n",
"Constraint satisfaction: True\n",
"Solving\n",
"Constraint satisfaction: True\n",
"Solving\n",
"Constraint satisfaction: True\n",
"Solving\n",
"Constraint satisfaction: True\n",
"Solving\n",
"Constraint satisfaction: True\n",
"Solving\n",
"Constraint satisfaction: True\n",
"Solving\n",
"Constraint satisfaction: True\n",
"Solving\n",
"Constraint satisfaction: True\n",
"Solving\n"
]
}
],
"source": [
"y_k = f_MT(mt_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test Results\n",
"\n",
"Below we print the accuracy (ACC) and DIDI on the test data for each iteration. We would expect fairness to increase (i.e., lower DIDI value) while the accuracy decreases as a trade-off."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"n ACC DIDI\n",
"0 0.8556 1.2272\n",
"1 0.2536 0.0000\n",
"2 0.2536 0.0000\n",
"3 0.2536 0.0000\n",
"4 0.2536 0.0000\n",
"5 0.2536 0.0000\n",
"6 0.2536 0.0000\n",
"7 0.2536 0.0000\n",
"8 0.2536 0.0000\n",
"9 0.2536 0.0000\n"
]
}
],
"source": [
"i = 0\n",
"\n",
"print(\"n ACC DIDI\")\n",
"for y_k in f_MT.y_k_test_history:\n",
" acc_k = accuracy_score(y_k, y_test)\n",
" didi_k = didi_c(np.array(x_test), y_k, [45, 44, 43, 42, 41])\n",
" print(\"%d %.4f %.4f\" % (i, acc_k, didi_k))\n",
" i += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",