Commit c6425cc0 authored by Fabrizio Detassis's avatar Fabrizio Detassis
Browse files

Merge branch 'master' of https://gitsvn-nt.oru.se/uwe.kockemann/moving-targets

 Conflicts:
	aiddl-project/python/moving_target/factory.py
	python/moving_target/moving_target_abc.py
	python/moving_target/moving_target_ortools.py
	python/moving_target/test_solver.py
parents f5ca5bc4 a5c0a5a8
......@@ -35,6 +35,6 @@ RUN sh cplex_studio1210.linux-x86-64.bin -f response.properties \
&& rm -rf /deps
WORKDIR /moving-target
RUN jupyter trust ./jupyter/Regression_Example.ipynb
RUN jupyter trust ./jupyter/*.ipynb
CMD ["./start.sh"]
\ No newline at end of file
* How to Install AIDDL
#+TITLE: AI Integration Languages: A Case Study in Constraint Machine Learning
Clone the repository:
To build the container open a terminal in this folder and run:
#+begin_src
git clone https://github.com/uwe-koeckemann/AIDDL.git
docker build -t moving-target-aiddl .
#+end_src
Enter folder and setup environment variables:
Then, to run the container:
#+begin_src
cd AIDDL
./environment.sh
docker run --rm --publish=8888:8888 moving-target-aiddl:latest
#+end_src
A local link to the jupyter notebook will appear in the console. Open the link
in a browser and select a notebook to try it. For now there is a single notebook
running linear regression through AIDDL.
#+TITLE: AI Integration Languages: A Case Study in Constraint Machine Learning
To build the container open a terminal in this folder and run:
#+begin_src
docker build -t moving-target-aiddl .
#+end_src
Then, to run the container:
#+begin_src
docker run --rm --publish=8888:8888 moving-target-aiddl:latest
#+end_src
A local link to the jupyter notebook will appear in the console. Open the link
in a browser and select a notebook to try it. For now there is a single notebook
running linear regression through AIDDL.
from moving_target_cplex import MovingTargetClsCplex, MovingTargetRegCplex
from moving_target_smt import MovingTargetClsSMT, MovingTargetRegSMT
from moving_target_ortools import MovingTargetRegOT, MovingTargetClsOT, MovingTargetClsOTCP
def get_problem(problem_type, solver='cplex'):
    """Return the moving-targets class for a problem type and solver backend.

    Args:
        problem_type: Either 'classification' or 'regression'.
        solver: Name of the constraint-solver backend. Classification
            accepts 'cplex', 'smt', 'ortools' and 'ortools-cp'; regression
            accepts 'cplex', 'smt' and 'ortools'. Defaults to 'cplex'.

    Returns:
        The matching MovingTarget* class object (the class, not an instance).

    Raises:
        ValueError: If the problem type or the solver name is not recognized.
    """
    # Every entry pairs a solver name with a thunk, so a class name is only
    # looked up after its solver has actually been selected.
    if problem_type == 'classification':
        choices = (
            ('cplex', lambda: MovingTargetClsCplex),
            ('smt', lambda: MovingTargetClsSMT),
            ('ortools', lambda: MovingTargetClsOT),
            ('ortools-cp', lambda: MovingTargetClsOTCP),
        )
    elif problem_type == 'regression':
        choices = (
            ('cplex', lambda: MovingTargetRegCplex),
            ('smt', lambda: MovingTargetRegSMT),
            ('ortools', lambda: MovingTargetRegOT),
        )
    else:
        raise ValueError("Problem type not understood: " + str(problem_type))

    # Solver names are tested in the listed order; the first match wins.
    for name, pick in choices:
        if solver == name:
            return pick()
    raise ValueError("Solver " + str(solver) + " not recognized!")
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"import matplotlib.pyplot as plt\n",
"plt.style.use('seaborn-whitegrid')\n",
"import numpy as np\n",
"\n",
"from aiddl_core.representation.symbolic import Symbolic as Sym\n",
"from aiddl_core.representation.variable import Variable as Var\n",
"from aiddl_core.representation.substitution import Substitution\n",
"from aiddl_core.container.container import Container\n",
"from aiddl_core.tools.combo_iterator import ComboIterator\n",
"from aiddl_core.tools.logger import Logger\n",
"from aiddl_core.function.default import get_default_function_registry\n",
"from aiddl_core.parser.parser import parse_term as parse\n",
"\n",
"module_path = os.path.abspath(os.path.join('../python/moving_target'))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
"from moving_target_cplex import MovingTargetRegCplex\n",
"from moving_target_cplex import MovingTargetClsCplex\n",
"from moving_target_ortools import MovingTargetRegOT\n",
"from moving_target_ortools import MovingTargetClsOT\n",
"\n",
"from factory import moving_targets_factory\n",
"from tools import CsvLoader\n",
"from utils import didi_r\n",
"from scikit_learn_wrapper import SciKitLearnFunction\n",
"from scikit_learn_wrapper import splitAiddlMlProblem\n",
"\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import mean_squared_error"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Data\n",
"\n",
"Create container and function registry and load some example data from a local file."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Label: \"violentPerPop\"\n",
"Attributes: [\n",
" (\"pctHousOccup\" org.aiddl.term)\n",
" (\"pct12-21\" org.aiddl.term)\n",
" (\"pct16-24\" org.aiddl.term)\n",
" (\"pctWorkMom-6\" org.aiddl.term)\n",
" (\"pctLargHous\" org.aiddl.term)\n",
" (\"ownHousQrange\" org.aiddl.term)\n",
" (\"pct12-29\" org.aiddl.term)\n",
" (\"persPerOccupHous\" org.aiddl.term)\n",
" (\"persPerFam\" org.aiddl.term)\n",
" (\"rentQrange\" org.aiddl.term)\n",
" (\"pctLargHousFam\" org.aiddl.term)\n",
" (\"persPerOwnOccup\" org.aiddl.term)\n",
" (\"whitePerCap\" org.aiddl.term)\n",
" (\"pctEmployProfServ\" org.aiddl.term)\n",
" (\"pctFgnImmig-3\" org.aiddl.term)\n",
" (\"race\" org.aiddl.term)\n",
" (\"violentPerPop\" org.aiddl.term)\n",
" ]\n",
"\n",
"Data (first row): [0.9764342597107102 0.13917940466613032 0.10512037978975924 0.7447776280323449 0.0486362142622412 0.31993957703927495 0.21429739898576802 0.2271062271062273 0.25106382978723407 0.20921544209215442 0.05278678855794751 0.31046931407942224 0.2408048532587981 0.35711430855315746 0.14779575007928955 0.0 0.03989083587243134]\n"
]
}
],
"source": [
"C = Container()\n",
"F = get_default_function_registry(C)\n",
"\n",
"loader = CsvLoader()\n",
"\n",
"test_data = loader.apply(parse('(\"../resources/crime_test.csv\" \",\" \"violentPerPop\")'))\n",
"train_data = loader.apply(parse('(\"../resources/crime_train.csv\" \",\" \"violentPerPop\")'))\n",
"\n",
"x, y = splitAiddlMlProblem(train_data)\n",
"x_test, y_test = splitAiddlMlProblem(test_data)\n",
"\n",
"print(\"Label:\", train_data[Sym(\"label\")])\n",
"print(\"Attributes:\", Logger.pretty_print(train_data[Sym(\"attributes\")], 1))\n",
"print(\"Data (first row):\", train_data[Sym(\"data\")][0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Without Moving Targets\n",
"\n",
"Fairness and score after a single learning step (without moving targets). Note that the DIDI calculation protects the last non-label attribute in our data set."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DIDI (pre-learning): 0.18046885714845506\n",
"DIDI (post-learning): 0.2278699416515176\n",
"MSE (test data): 0.007579562360896998\n"
]
}
],
"source": [
"didi_init = didi_r(np.array(x_test), np.array(y_test), [len(x[0])-1])\n",
"\n",
"print(\"DIDI (pre-learning):\", didi_init)\n",
"\n",
"learner = parse('''{\n",
" py_module:sklearn.ensemble\n",
" py_class:GradientBoostingRegressor\n",
" n_estimators:50\n",
" min_samples_leaf:5\n",
"}''')\n",
"\n",
"f_ML = SciKitLearnFunction(learner)\n",
"F.add_function(Sym(\"mt.learner\"), f_ML)\n",
"f_ML(train_data)\n",
"y_0 = f_ML.predict(x_test)\n",
"\n",
"didi_0 = didi_r(np.array(x_test), np.array(y_0), [len(x[0])-1])\n",
"mse_0 = mean_squared_error(y_0, y_test)\n",
"\n",
"print(\"DIDI (post-learning):\", didi_0)\n",
"print(\"MSE (test data):\", mse_0)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Moving Targets\n",
"\n",
"## Parameters\n",
"\n",
"Below we set the following parameters to configure moving targets:\n",
"\n",
"- **n**: Maximum number of iterations\n",
"- **alpha**: Parameter alpha\n",
"- **beta**: Parameter beta\n",
"- **problem-type**: *classification* or *regression*\n",
"- **constraint-solver**: *cplex*, *ortools*, or *pysmt*\n",
"- **learner**: Symbolic name of the machine learning function\n",
"\n",
"The corresponding AIDDL term is a tuple *(k1:v1 k2:v2)* of key-value pairs *key:value*. Here we parse the information from a string, but it can also be loaded from an entry in an AIDDL file."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"cfg = parse('''(\n",
" n:10 \n",
" alpha:1.0 \n",
" beta:1.0 \n",
" problem-type:regression \n",
" constraint-solver:ortools \n",
" learner:mt.learner\n",
")''')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loss Function, Constraints and Data\n",
"\n",
"- **loss-function**: \n",
" - **regression**: *MeanAbsoluteError* or *MeanSquaredError*\n",
" - **classification**: *HammingDistance*\n",
"- **constraints**: AIDDL set *{c1 ...}* of constraints\n",
" - **(< y c)**: attribute *y* should be less than constant *c* (same for <=, >, >=)\n",
" - **(balance y q)**: attribute *y* balanced with unbalance at most *q*\n",
" - **(didi y [x1 x2] q)**: model for *y* should be fair wrt. *x1* and *x2* with unfairness at most *q* \n",
"\n",
"The last line below adds the converted data set with fields *data*, *label*, and *attributes*. As a result *mt_data* will contain all information required by machine learning, constraint solver, and moving targets in a single set. Note that despite the additional information, *mt_data* could still be used directly as input to a machine learning algorithm. "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"mt_data = parse('''{\n",
" loss-function:MeanAbsoluteError\n",
" constraints:{\n",
" (didi \"violentPerPop\" [\"race\"] 0.01)\n",
" }\n",
"}''')\n",
"mt_data = mt_data.put_all(train_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Instance of Moving Targets\n",
"\n",
"A factory method is used to create a version of moving targets for the specified problem type and with the selected constraint solver. We also provide a function registry *F* that allows us to look up the learning function based on the symbolic name (*mt.learner* above), as well as the test data that will be used on each iteration of moving targets to track performance on non-training data."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"f_MT = moving_targets_factory(cfg, F, test_data=x_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Apply Moving Targets to Input"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initializing model\n",
"('didi', 'violentPerPop', [15], 0.01)\n",
"Constraint added: ('didi', 'violentPerPop', [15], 0.01)\n",
"Constraint satisfaction: False\n",
"Solving\n",
"Constraint satisfaction: False\n",
"Solving\n",
"Constraint satisfaction: False\n",
"Solving\n",
"Constraint satisfaction: False\n",
"Solving\n",
"Constraint satisfaction: False\n",
"Solving\n",
"Constraint satisfaction: False\n",
"Solving\n",
"Constraint satisfaction: False\n",
"Solving\n",
"Constraint satisfaction: False\n",
"Solving\n",
"Constraint satisfaction: True\n",
"Solving\n",
"Constraint satisfaction: True\n",
"Solving\n"
]
}
],
"source": [
"y_k = f_MT(mt_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test Results\n",
"\n",
"Below we print the MSE and DIDI on the test data for each iteration. We would expect fairness to increase (i.e., a lower DIDI value) while the MSE increases as a trade-off."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Report test-set MSE and DIDI after every moving-targets iteration.\n",
"# The loop uses its own name (y_k_test) so that the y_k returned by\n",
"# f_MT(mt_data) in the previous cell is not overwritten.\n",
"print(\"n MSE DIDI\")\n",
"for i, y_k_test in enumerate(f_MT.y_k_test_history):\n",
"    # sklearn's documented argument order is (y_true, y_pred).\n",
"    mse_k = mean_squared_error(y_test, y_k_test)\n",
"    # The protected attribute is the last non-label column.\n",
"    didi_k = didi_r(np.array(x_test), y_k_test, [len(x[0])-1])\n",
"    print(\"%d %.4f %.4f\" % (i, mse_k, didi_k))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": false,
"sideBar": false,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": false,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"import matplotlib.pyplot as plt\n",
"plt.style.use('seaborn-whitegrid')\n",
"import numpy as np\n",
"\n",
"from aiddl_core.representation.symbolic import Symbolic as Sym\n",
"from aiddl_core.representation.variable import Variable as Var\n",
"from aiddl_core.representation.real import Real\n",
"from aiddl_core.representation.substitution import Substitution\n",
"from aiddl_core.container.container import Container\n",
"from aiddl_core.tools.combo_iterator import ComboIterator\n",
"from aiddl_core.tools.logger import Logger\n",
"from aiddl_core.function.default import get_default_function_registry\n",
"from aiddl_core.parser.parser import parse_term as parse\n",
"\n",
"module_path = os.path.abspath(os.path.join('../python/moving_target'))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
"from moving_target_cplex import MovingTargetRegCplex\n",
"from moving_target_cplex import MovingTargetClsCplex\n",
"from moving_target_ortools import MovingTargetRegOT\n",
"from moving_target_ortools import MovingTargetClsOT\n",
"\n",
"from factory import moving_targets_factory\n",
"from tools import CsvLoader\n",
"from utils import didi_r\n",
"from scikit_learn_wrapper import SciKitLearnFunction\n",
"from scikit_learn_wrapper import splitAiddlMlProblem\n",
"\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import mean_squared_error"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup\n",
"\n",
"Create container and function registry and load some example data from a local file."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Label: \"violentPerPop\"\n",
"Attributes: [\n",
" (\"pctHousOccup\" org.aiddl.term)\n",
" (\"pct12-21\" org.aiddl.term)\n",
" (\"pct16-24\" org.aiddl.term)\n",
" (\"pctWorkMom-6\" org.aiddl.term)\n",
" (\"pctLargHous\" org.aiddl.term)\n",
" (\"ownHousQrange\" org.aiddl.term)\n",
" (\"pct12-29\" org.aiddl.term)\n",
" (\"persPerOccupHous\" org.aiddl.term)\n",
" (\"persPerFam\" org.aiddl.term)\n",
" (\"rentQrange\" org.aiddl.term)\n",
" (\"pctLargHousFam\" org.aiddl.term)\n",
" (\"persPerOwnOccup\" org.aiddl.term)\n",
" (\"whitePerCap\" org.aiddl.term)\n",
" (\"pctEmployProfServ\" org.aiddl.term)\n",
" (\"pctFgnImmig-3\" org.aiddl.term)\n",
" (\"race\" org.aiddl.term)\n",
" (\"violentPerPop\" org.aiddl.term)\n",
" ]\n",
"\n",
"Data (first row): [0.9764342597107102 0.13917940466613032 0.10512037978975924 0.7447776280323449 0.0486362142622412 0.31993957703927495 0.21429739898576802 0.2271062271062273 0.25106382978723407 0.20921544209215442 0.05278678855794751 0.31046931407942224 0.2408048532587981 0.35711430855315746 0.14779575007928955 0.0 0.03989083587243134]\n"
]
}
],
"source": [
"C = Container()\n",
"F = get_default_function_registry(C)\n",
"\n",
"loader = CsvLoader()\n",
"\n",
"test_data = loader.apply(parse('(\"../resources/crime_test.csv\" \",\" \"violentPerPop\")'))\n",
"train_data = loader.apply(parse('(\"../resources/crime_train.csv\" \",\" \"violentPerPop\")'))\n",
"\n",
"x, y = splitAiddlMlProblem(train_data)\n",
"x_test, y_test = splitAiddlMlProblem(test_data)\n",
"\n",
"print(\"Label:\", train_data[Sym(\"label\")])\n",
"print(\"Attributes:\", Logger.pretty_print(train_data[Sym(\"attributes\")], 1))\n",
"print(\"Data (first row):\", train_data[Sym(\"data\")][0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Experimenting with the Crime Data Set\n",
"\n",
"To create a set of experiments, we simply replace parameters in configuration and constraints by variables.\n",
"For each variable, we create a list of choices and iterate over all possible combinations. We can replace parameters of constraints or full constraints in the same way.\n",
"\n",
"## Experimental Setup\n",
"### Learner Configuration\n",
"Select and configure learners. Each entry in the set below is a key-value pair. The key is a symbolic name that can be used to refer to the function and the value is a set containing the details to configure the learner."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"learner_configs = parse('''{\n",
" gradient-boosting-regressor:\n",
" {\n",
" py_module:sklearn.ensemble\n",
" py_class:GradientBoostingRegressor\n",
" n_estimators:50\n",
" min_samples_leaf:5\n",
" }\n",
"}''')\n",
"\n",
"for l_cfg in learner_configs:\n",
" f_ML = SciKitLearnFunction(l_cfg.get_value())\n",
" F.add_function(l_cfg.get_key(), f_ML)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Moving Target Configuration\n",
"\n",
"The following term contains the parameters of the moving targets algorithm. \n",
"Variables are indicated by a question mark. We specify possible values for all variables below. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"mt_cfg_exp = parse('''(\n",
" n:10 ;; Number of iterations\n",
" alpha:?a ;; Parameter alpha\n",
" beta:?b ;; Parameter beta\n",
" problem-type:regression ;; classification/regression?\n",
" constraint-solver:cplex ;; Select constraint solver (cplex/pysmt)\n",
" learner:gradient-boosting-regressor ;; URI of learner (registered above)\n",
")''')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Input Configuration\n",
"\n",
"Here we set loss function and constraints to be used. "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"mt_data_exp = parse('''{\n",
" loss-function:MeanSquaredError ;; Select loss function\n",
" constraints:{\n",
</