diff --git a/apply_model/README.md b/apply_model/README.md index 00a098d..5ce90d5 100644 --- a/apply_model/README.md +++ b/apply_model/README.md @@ -14,3 +14,9 @@ * [Apply CatBoost model from Rust](./rust/train_model.ipynb) * Explore how to apply CatBoost model from Rust application. If you just want to look at code snippets you can go directly to [main.rs](./rust/src/main.rs) + +* [Apply CatBoost model from C](./clang/train_model.ipynb) + * Explore how to apply CatBoost model from C application. If you just want to look at code snippets you can go directly to [main.c](./clang/src/main.c) + +* [Apply CatBoost model from D](./dlang/train_model.ipynb) + * Explore how to apply CatBoost model from D application. If you just want to look at code snippets you can go directly to [main.d](./dlang/src/main.d) diff --git a/apply_model/clang/readme.md b/apply_model/clang/readme.md new file mode 100644 index 0000000..b6a819f --- /dev/null +++ b/apply_model/clang/readme.md @@ -0,0 +1,14 @@ +# Apply CatBoost model from C +This tutorial consists of two parts: +- first part where we preprocess dataset and train the classifier model. + This part can be found in [train_model.ipynb](train_model.ipynb). +- second part where we load model into C application and then apply it. + This part is presented as a C file. First you need to build the library, as described in [Evaluation library](https://catboost.ai/en/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper). To run, you can execute: + * on Linux/macOS + + `clang <your sources and options> -L<path to dir with libcatboostmodel> -lcatboostmodel` + * on Windows + + `cl.exe <your sources and options> /link <path to dir with libcatboostmodel>\catboostmodel.lib` + + If you just want to look at code snippets you can go directly to [src/main.c](src/main.c).
diff --git a/apply_model/clang/src/main.c b/apply_model/clang/src/main.c new file mode 100644 index 0000000..56c4cb2 --- /dev/null +++ b/apply_model/clang/src/main.c @@ -0,0 +1,180 @@ +#include +#include + +// Bring catboost module into the scope +#include + +double sigmoid(double x) { + return 1. / (1. + exp(-x)); +} + +char* answer(bool makes_over_50k_a_year) { + if (makes_over_50k_a_year) { + return "makes over 50k a year"; + } else { + return "doesn't make over 50k a year"; + } +} + +int main(int argc, const char * argv[]) { + // Load "adult.cbm" model that we trained withing Jupyter Notebook + ModelCalcerHandle* modelHandle; + modelHandle = ModelCalcerCreate(); + if (!LoadFullModelFromFile(modelHandle, "adult.cbm")) { + printf("LoadFullModelFromFile error message: %s\n", GetErrorString()); + } + + // You can also try to load your own model just replace "adult.cbm" with path to your model that classifies data + // from UCI Adult Dataset. + + printf("Adult dataset model metainformation\n"); + + printf("tree count: %zu\n", GetTreeCount(modelHandle)); + + // In our case we were solving a binary classification problem (weather person makes over 50K a year), so the + // dimension of the prediction will be 1, it will return probability of the object to belong to the positive + // class; in our case we had two classed encoded as "<=50K" and ">50K", during data preprocessing (see + // `get_fixed_adult()` in Notebook) we encoded "<=50K" as 0 and ">50K" as 1, so that ">50K" became a positive + // class. Probability of the negative class ("<=50K") can be easily deduced as (1-p) where p is a probability of + // positive class. + // + // For most of cases prediction dimension will be 1 (for regression and for ranking), it can be N for cases of + // multiclassification, where N is a number of classes. 
+ printf("prediction dimension: %zu\n", GetDimensionsCount(modelHandle)); + + printf("numeric feature count: %zu\n", GetFloatFeaturesCount(modelHandle)); + + printf("categoric feature count: %zu\n", GetCatFeaturesCount(modelHandle)); + + // Ok now lets try to use our model for prediction. We'll look at the test part of Adult dataset. You will need + // to download it [1] from UCI repository. Look for "adult.test", "adult.name" will also be useful because it + // in contains human-readable description of the dataset. + // + // So the first line of test part of the dataset is: + // + // "25, Private, 226802, 11th, 7, Never-married, Machine-op-inspct, Own-child, Black, Male, 0, 0, 40, United-States, <=50K." + // + // Based on "adult.name" we can recover its vectors of numeric and categoric features (in our case all + // "continuous" features are numeric and all other features are categoric): + // + // numericFeatures: {25, 226802, 7, 0, 0, 40} + // categoricFeatures: {"Private", "11th", "Never-married", "Machine-op-inspct", "Own-child", "Black", "Male", "United-States"} + // + // And he doesn't make 50K per year. Also note that order of numeric and categoric features in source data and + // in `numericFeatures` and `categoricFeatures` is kept the same. Otherwise we can't apply the model (well, we + // can, but result of prediction will be garbage). + // + // Now lets run it! And let's call this person "person A", to make variable names unique. 
+ // + // [1]: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/ + + printf("\n"); + + float pers_a_num_feat[6] = {25., 226802., 7., 0., 0., 40.}; + char* pers_a_cat_feat[8] = {"Private","11th","Never-married","Machine-op-inspct","Own-child","Black","Male","United-States"}; + + double result_a[1]; + + const float* a_num_feat_ptr = pers_a_num_feat; + const char** a_cat_feat_ptr = pers_a_cat_feat; + + if (!CalcModelPrediction( + modelHandle, + 1, + &a_num_feat_ptr, 6, + &a_cat_feat_ptr, 8, + &result_a, 1) + ) { + printf("CalcModelPrediction error message: %s\n", GetErrorString()); + } + + // Since we made prediction only for one person and prediction dimension is 1, proability of person A make + // over 50K will have index 0 in `person_a_prediction`. + // + // CatBoost doesn't compute "probability", to turn CatBoost prediction into a probability we'll need to apply + // sigmoid function. + double pers_a_makes_over_50k_prob = sigmoid(result_a[0]); + printf("Person A make over 50K a year with probability %f\n", pers_a_makes_over_50k_prob); + + // When we were training CatBoost we used a default classification threshold for AUC which is equal to 0.5, + // this means that our formula is optimized for this threashold, though we may change threshold to optimize some + // other metric on a different dataset, but we won't do it in this tutorial. + double classification_threshold = 0.5; + + bool pers_a_makes_over_50k = pers_a_makes_over_50k_prob > classification_threshold; + printf("Person A %s\n", answer(pers_a_makes_over_50k)); + + // Now lets find an example with missing features and income greater than 50K a year. At line 40 of "adult.test" + // we can find following line: + // + // "40, Private, 85019, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 45, ?, >50K." + // + // Lets call this person "Person B", dataset missing (missing features are marked with "?") "native-county" + // feature for Person B. 
When we were doing preprocessing in `get_fixed_adult` we replaced missing categoric + // features with string "nan", now, when we apply trained model we must also use "nan" for missing features. + // Lets write out feature vectors for Person B: + // + // numericFeatures = {40, 85019, 16, 0, 0, 45}; + // categoricFeatures = {"Private", "Doctorate", "Married-civ-spouce", "Prof-specialty", "Husband", "Asian-Pac-Islander", "Male", "nan"}; + // + // And according to the dataset Person B makes more than 50K a year. Ok, lets try to apply the model to this + // example. + + printf("\n"); + + float pers_b_num_feat[w] = {40., 85019., 16., 0., 0., 45.}; + char* pers_b_cat_feat[8] = {"Private","Doctorate","Married-civ-spouce","Prof-specialty","Husband","Asian-Pac-Islander","Male","nan"}; + + double result_b[1]; + + const float* b_num_feat_ptr = pers_b_num_feat; + const char** b_cat_feat_ptr = pers_b_cat_feat; + + if (!CalcModelPrediction( + modelHandle, + 1, + &b_num_feat_ptr, 6, + &b_cat_feat_ptr, 8, + &result_b, 1) + ) { + printf("CalcModelPrediction error message: %s\n", GetErrorString()); + } + double pers_b_makes_over_50k_prob = sigmoid(result_b[0]); + bool pers_b_makes_over_50k = pers_b_makes_over_50k_prob > classification_threshold; + printf("Person B make over 50K a year with probability %f\n", pers_b_makes_over_50k_prob); + printf("Person B %s\n", answer(pers_b_makes_over_50k)); + + // Let's try to apply the model to Person A and Person B in one call. 
+ printf("\n"); + + float* pers_ab_num_feat[2] = {pers_a_num_feat, pers_b_num_feat}; + char** pers_ab_cat_feat[2] = {pers_a_cat_feat, pers_b_cat_feat}; + + double result_ab[2]; + + const float** ab_num_feat_ptr = (const float**)pers_ab_num_feat; + const char*** ab_cat_feat_ptr = (const char**)pers_ab_cat_feat; + + if (!CalcModelPrediction( + modelHandle, + 2, + ab_num_feat_ptr, 6, + ab_cat_feat_ptr, 8, + &result_ab, 2) + ) { + printf("CalcModelPrediction error message: %s\n", GetErrorString()); + } + double pers_ab_makes_over_50k_prob[2] = {sigmoid(result_ab[0]), sigmoid(result_ab[1])}; + bool pers_ab_makes_over_50k[2] = {pers_ab_makes_over_50k_prob[0] > classification_threshold, pers_ab_makes_over_50k_prob[1] > classification_threshold}; + + printf("Using batch interface\n"); + + // Predictions should be same as above + printf("Person A make over 50K a year with probability %f\n", pers_ab_makes_over_50k_prob[0]); + printf("Person A %s\n", answer(pers_ab_makes_over_50k[0])); + printf("Person B make over 50K a year with probability %f\n", pers_ab_makes_over_50k_prob[1]); + printf("Person B %s\n", answer(pers_ab_makes_over_50k[1])); + + ModelCalcerDelete(modelHandle); + return 0; +} \ No newline at end of file diff --git a/apply_model/clang/train_model.ipynb b/apply_model/clang/train_model.ipynb new file mode 100644 index 0000000..0d03029 --- /dev/null +++ b/apply_model/clang/train_model.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# catboost for clang tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q numpy pandas catboost" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import absolute_import, division, print_function, unicode_literals" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "CatBoost version 0.14.2\n", + "NumPy version 1.16.3\n", + "Pandas version 0.24.2\n" + ] + } + ], + "source": [ + "import catboost as cb\n", + "import catboost.datasets as cbd\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# print module versions for reproducibility\n", + "print('CatBoost version {}'.format(cb.__version__))\n", + "print('NumPy version {}'.format(np.__version__))\n", + "print('Pandas version {}'.format(pd.__version__))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Download \"Adult Data Set\" [1] from UCI Machine Learning Repository.\n", + "\n", + " Will return two pandas.DataFrame-s, first with train part (adult.data) and second with test part\n", + " (adult.test) of the dataset.\n", + "\n", + " [1]: https://archive.ics.uci.edu/ml/datasets/Adult\n", + " \n" + ] + } + ], + "source": [ + "# We are going to use UCI Adult Data Set because it has both numerical and categorical \n", + "# features and also has missing features.\n", + "print(cbd.adult.__doc__)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def get_fixed_adult():\n", + " train, test = cbd.adult()\n", + " \n", + " # CatBoost doesn't support pandas.DataFrame missing values for categorical features out \n", + " # of the box (seed issue #571 on GitHub or issue MLTOOLS-2785 in internal tracker). So \n", + " # we have to replace them with some designated string manually. 
\n", + " for dataset in (train, test, ):\n", + " for name in (name for name, dtype in dict(dataset.dtypes).items() if dtype == np.object):\n", + " dataset[name].fillna('nan', inplace=True)\n", + " \n", + " X_train, y_train = train.drop('income', axis=1), train.income\n", + " X_test, y_test = test.drop('income', axis=1), test.income\n", + " return X_train, y_train, X_test, y_test" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, y_train, _, _ = get_fixed_adult()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-country
039.0State-gov77516.0Bachelors13.0Never-marriedAdm-clericalNot-in-familyWhiteMale2174.00.040.0United-States
150.0Self-emp-not-inc83311.0Bachelors13.0Married-civ-spouseExec-managerialHusbandWhiteMale0.00.013.0United-States
238.0Private215646.0HS-grad9.0DivorcedHandlers-cleanersNot-in-familyWhiteMale0.00.040.0United-States
353.0Private234721.011th7.0Married-civ-spouseHandlers-cleanersHusbandBlackMale0.00.040.0United-States
428.0Private338409.0Bachelors13.0Married-civ-spouseProf-specialtyWifeBlackFemale0.00.040.0Cuba
\n", + "
" + ], + "text/plain": [ + " age workclass fnlwgt education education-num \\\n", + "0 39.0 State-gov 77516.0 Bachelors 13.0 \n", + "1 50.0 Self-emp-not-inc 83311.0 Bachelors 13.0 \n", + "2 38.0 Private 215646.0 HS-grad 9.0 \n", + "3 53.0 Private 234721.0 11th 7.0 \n", + "4 28.0 Private 338409.0 Bachelors 13.0 \n", + "\n", + " marital-status occupation relationship race sex \\\n", + "0 Never-married Adm-clerical Not-in-family White Male \n", + "1 Married-civ-spouse Exec-managerial Husband White Male \n", + "2 Divorced Handlers-cleaners Not-in-family White Male \n", + "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", + "4 Married-civ-spouse Prof-specialty Wife Black Female \n", + "\n", + " capital-gain capital-loss hours-per-week native-country \n", + "0 2174.0 0.0 40.0 United-States \n", + "1 0.0 0.0 13.0 United-States \n", + "2 0.0 0.0 40.0 United-States \n", + "3 0.0 0.0 40.0 United-States \n", + "4 0.0 0.0 40.0 Cuba " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: Custom metrics will not be evaluated because there are no test datasets\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# If you want to find out how we found these parameters check \"Simple classification \n", + "# example with missing feature handling and parameter tuning\" tutorial in `classification`\n", + "# subdirectory of tutorials\n", + "model = cb.CatBoostClassifier(\n", + " class_names=('<=50K', '>50K'),\n", + " loss_function='Logloss',\n", + " eval_metric='AUC', \n", + " custom_metric=['AUC'],\n", + " iterations=100,\n", + " random_seed=20181224,\n", + " learning_rate=0.4234185321620083, \n", + " depth=5, \n", + " 
l2_leaf_reg=9.464266235679002)\n", + "model.fit(\n", + " cb.Pool(X_train, y_train, cat_features=np.where(X_train.dtypes != np.float)[0]),\n", + " verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "model.save_model('adult.cbm')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "156K\tadult.cbm\r\n" + ] + } + ], + "source": [ + "!du -sh adult.cbm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We got the model, now it's time to use it via `catboost` package for C. Next part of the tutorial\n", + "will be in a C project." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/apply_model/dlang/readme.md b/apply_model/dlang/readme.md new file mode 100644 index 0000000..f48f526 --- /dev/null +++ b/apply_model/dlang/readme.md @@ -0,0 +1,24 @@ +# Apply CatBoost model from D +This tutorial consists of two parts: +- first part where we preprocess dataset and train the classifier model. + This part can be found in [train_model.ipynb](train_model.ipynb). +- second part where we load model into D application and then apply it. + This part presented as a D file. At first you need to build a library, as it is suggested on [Evaluation library](https://catboost.ai/en/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper). + + After that you need to generate preprocessed header file (.i) for ImportC functionality. 
To prepare preprocessed header you need to create a file [lib_import.c](src/lib_import.c) consists of one line + + `#include <c_api.h>` + + and use available C compiler (or built-in preprocessor in D compiler) as suggested on the page of [ImportC Documentation](https://dlang.org/spec/importc.html): + + `clang -E lib_import.c -o lib_import.i` + + To run, you can execute: + * in case Linux/macOS + + `ldc2 <your sources and options> -L<path to dir with libcatboostmodel>/libcatboostmodel.{so/dylib}` + * in case Windows + + `ldc2.exe <your sources and options> /link <path to dir with libcatboostmodel>\catboostmodel.{lib/dll}` + + If you just want to look at code snippets you can go directly to [src/main.d](src/main.d). diff --git a/apply_model/dlang/src/lib_import.c b/apply_model/dlang/src/lib_import.c new file mode 100644 index 0000000..c7ef90a --- /dev/null +++ b/apply_model/dlang/src/lib_import.c @@ -0,0 +1 @@ +#include <c_api.h> diff --git a/apply_model/dlang/src/main.d b/apply_model/dlang/src/main.d new file mode 100644 index 0000000..39a1e81 --- /dev/null +++ b/apply_model/dlang/src/main.d @@ -0,0 +1,220 @@ +import std.stdio; +import std.math : exp; +import std.string : fromStringz; + +// Bring catboost module into the scope +import lib_import; + +// Logistic function: maps a raw model score to a value in (0, 1). +double sigmoid(double x) +{ + return 1. / (1. + exp(-x)); +} + +// Returns a human-readable verdict for a thresholded prediction. +string answer(bool makes_over_50k_a_year) +{ + if (makes_over_50k_a_year) + { + return "makes over 50k a year"; + } + else + { + return "doesn't make over 50k a year"; + } +} + +void main(string[] args) +{ + // Load "adult.cbm" model that we trained within Jupyter Notebook + ModelCalcerHandle* modelHandle = ModelCalcerCreate(); + if (!(modelHandle.LoadFullModelFromFile("adult.cbm"))) + { + // GetErrorString() is a C string; convert it so the text is printed instead of a pointer value + writeln("LoadFullModelFromFile error message: ", GetErrorString().fromStringz); + } + // You can also try to load your own model just replace "adult.cbm" with path to your model that classifies data + // from UCI Adult Dataset.
+ + writeln("Adult dataset model metainformation\n"); + + writeln("tree count: ", modelHandle.GetTreeCount()); + + // In our case we were solving a binary classification problem (whether person makes over 50K a year), so the + // dimension of the prediction will be 1, it will return probability of the object to belong to the positive + // class; in our case we had two classes encoded as "<=50K" and ">50K", during data preprocessing (see + // `get_fixed_adult()` in Notebook) we encoded "<=50K" as 0 and ">50K" as 1, so that ">50K" became a positive + // class. Probability of the negative class ("<=50K") can be easily deduced as (1-p) where p is a probability of + // positive class. + // + // For most of cases prediction dimension will be 1 (for regression and for ranking), it can be N for cases of + // multiclassification, where N is a number of classes. + writeln("prediction dimension: ", modelHandle.GetDimensionsCount()); + + writeln("numeric feature count: ", modelHandle.GetFloatFeaturesCount()); + + writeln("categoric feature count: ", modelHandle.GetCatFeaturesCount()); + + // Ok now lets try to use our model for prediction. We'll look at the test part of Adult dataset. You will need + // to download it [1] from UCI repository. Look for "adult.test", "adult.name" will also be useful because it + // contains human-readable description of the dataset. + // + // So the first line of test part of the dataset is: + // + // "25, Private, 226802, 11th, 7, Never-married, Machine-op-inspct, Own-child, Black, Male, 0, 0, 40, United-States, <=50K." + // + // Based on "adult.name" we can recover its vectors of numeric and categoric features (in our case all + // "continuous" features are numeric and all other features are categoric): + // + // numericFeatures: {25, 226802, 7, 0, 0, 40} + // categoricFeatures: {"Private", "11th", "Never-married", "Machine-op-inspct", "Own-child", "Black", "Male", "United-States"} + // + // And he doesn't make 50K per year.
Also note that order of numeric and categoric features in source data and + // in `numericFeatures` and `categoricFeatures` is kept the same. Otherwise we can't apply the model (well, we + // can, but result of prediction will be garbage). + // + // Now lets run it! And let's call this person "person A", to make variable names unique. + // + // [1]: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/ + + writeln(); + + const(float)[6] pers_a_num_feat = [25., 226_802., 7., 0., 0., 40.]; + const(char)*[8] pers_a_cat_feat = [ + "Private", + "11th", + "Never-married", + "Machine-op-inspct", + "Own-child", + "Black", + "Male", + "United-States" + ]; + + double[1] result_a = [0]; + + auto a_num_feat_ptr = pers_a_num_feat.ptr; + auto a_cat_feat_ptr = pers_a_cat_feat.ptr; + + if (!modelHandle.CalcModelPrediction( + 1, + &a_num_feat_ptr, 6, + &a_cat_feat_ptr, 8, + result_a.ptr, 1)) + { + writeln("CalcModelPrediction error message: ", GetErrorString().fromStringz); + } + + // Since we made prediction only for one person and prediction dimension is 1, probability of person A make + // over 50K will have index 0 in `result_a`. + // + // CatBoost doesn't compute "probability", to turn CatBoost prediction into a probability we'll need to apply + // sigmoid function. + double pers_a_makes_over_50k_prob = sigmoid(result_a[0]); + writeln( + "Person A make over 50K a year with probability ", + pers_a_makes_over_50k_prob + ); + + // When we were training CatBoost we used a default classification threshold for AUC which is equal to 0.5, + // this means that our formula is optimized for this threshold, though we may change threshold to optimize some + // other metric on a different dataset, but we won't do it in this tutorial.
+ double classification_threshold = 0.5; + + bool pers_a_makes_over_50k = pers_a_makes_over_50k_prob > classification_threshold; + writeln("Person A ", answer(pers_a_makes_over_50k)); + + // Now lets find an example with missing features and income greater than 50K a year. At line 40 of "adult.test" + // we can find following line: + // + // "40, Private, 85019, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 45, ?, >50K." + // + // Lets call this person "Person B", dataset missing (missing features are marked with "?") "native-country" + // feature for Person B. When we were doing preprocessing in `get_fixed_adult` we replaced missing categoric + // features with string "nan", now, when we apply trained model we must also use "nan" for missing features. + // Lets write out feature vectors for Person B: + // + // numericFeatures = {40, 85019, 16, 0, 0, 45}; + // categoricFeatures = {"Private", "Doctorate", "Married-civ-spouse", "Prof-specialty", "Husband", "Asian-Pac-Islander", "Male", "nan"}; + // + // And according to the dataset Person B makes more than 50K a year. Ok, lets try to apply the model to this + // example.
+ + writeln(); + + const(float)[6] pers_b_num_feat = [40., 85_019., 16., 0., 0., 45.]; + const(char)*[8] pers_b_cat_feat = [ + "Private", + "Doctorate", + "Married-civ-spouse", + "Prof-specialty", + "Husband", + "Asian-Pac-Islander", + "Male", + "nan" + ]; + + double[1] result_b = [0]; + + auto b_num_feat_ptr = pers_b_num_feat.ptr; + auto b_cat_feat_ptr = pers_b_cat_feat.ptr; + + if (!modelHandle.CalcModelPrediction( + 1, + &b_num_feat_ptr, 6, + &b_cat_feat_ptr, 8, + result_b.ptr, 1)) + { + writeln("CalcModelPrediction error message: ", GetErrorString().fromStringz); + } + + double pers_b_makes_over_50k_prob = sigmoid(result_b[0]); + bool pers_b_makes_over_50k = pers_b_makes_over_50k_prob > classification_threshold; + writeln( + "Person B make over 50K a year with probability ", + pers_b_makes_over_50k_prob + ); + writeln("Person B ", answer(pers_b_makes_over_50k)); + + // Let's try to apply the model to Person A and Person B in one call. + + writeln(); + + // Row pointers refer to the per-person arrays declared above (no copies are made) + const(float)*[2] pers_ab_num_feat = [pers_a_num_feat.ptr, pers_b_num_feat.ptr]; + const(char)**[2] pers_ab_cat_feat = [pers_a_cat_feat.ptr, pers_b_cat_feat.ptr]; + + double[2] result_ab = [0, 0]; + + auto ab_num_feat_ptr = pers_ab_num_feat.ptr; + auto ab_cat_feat_ptr = pers_ab_cat_feat.ptr; + + if (!modelHandle.CalcModelPrediction( + 2, + ab_num_feat_ptr, 6, + ab_cat_feat_ptr, 8, + result_ab.ptr, 2)) + { + writeln("CalcModelPrediction error message: ", GetErrorString().fromStringz); + } + + double[2] pers_ab_makes_over_50k_prob = [sigmoid(result_ab[0]), sigmoid(result_ab[1])]; + bool[2] pers_ab_makes_over_50k = [ + pers_ab_makes_over_50k_prob[0] > classification_threshold, + pers_ab_makes_over_50k_prob[1] > classification_threshold + ]; + + writeln("Using batch interface"); + + // Predictions should be same as above + writeln( + "Person A make over 50K a year with probability ", + pers_ab_makes_over_50k_prob[0] + ); + writeln("Person A ", answer(pers_ab_makes_over_50k[0])); +
writeln( + "Person B make over 50K a year with probability ", + pers_ab_makes_over_50k_prob[1] + ); + writeln("Person B ", answer(pers_ab_makes_over_50k[1])); + + modelHandle.ModelCalcerDelete(); +} diff --git a/apply_model/dlang/train_model.ipynb b/apply_model/dlang/train_model.ipynb new file mode 100644 index 0000000..10f9ca1 --- /dev/null +++ b/apply_model/dlang/train_model.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# catboost for dlang tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q numpy pandas catboost" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import absolute_import, division, print_function, unicode_literals" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CatBoost version 0.14.2\n", + "NumPy version 1.16.3\n", + "Pandas version 0.24.2\n" + ] + } + ], + "source": [ + "import catboost as cb\n", + "import catboost.datasets as cbd\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# print module versions for reproducibility\n", + "print('CatBoost version {}'.format(cb.__version__))\n", + "print('NumPy version {}'.format(np.__version__))\n", + "print('Pandas version {}'.format(pd.__version__))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Download \"Adult Data Set\" [1] from UCI Machine Learning Repository.\n", + "\n", + " Will return two pandas.DataFrame-s, first with train part (adult.data) and second with test part\n", + " (adult.test) of the dataset.\n", + "\n", + " [1]: https://archive.ics.uci.edu/ml/datasets/Adult\n", + " \n" + ] + } + ], + "source": [ + "# We are going to 
use UCI Adult Data Set because it has both numerical and categorical \n", + "# features and also has missing features.\n", + "print(cbd.adult.__doc__)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def get_fixed_adult():\n", + " train, test = cbd.adult()\n", + " \n", + " # CatBoost doesn't support pandas.DataFrame missing values for categorical features out \n", + " # of the box (seed issue #571 on GitHub or issue MLTOOLS-2785 in internal tracker). So \n", + " # we have to replace them with some designated string manually. \n", + " for dataset in (train, test, ):\n", + " for name in (name for name, dtype in dict(dataset.dtypes).items() if dtype == np.object):\n", + " dataset[name].fillna('nan', inplace=True)\n", + " \n", + " X_train, y_train = train.drop('income', axis=1), train.income\n", + " X_test, y_test = test.drop('income', axis=1), test.income\n", + " return X_train, y_train, X_test, y_test" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, y_train, _, _ = get_fixed_adult()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-country
039.0State-gov77516.0Bachelors13.0Never-marriedAdm-clericalNot-in-familyWhiteMale2174.00.040.0United-States
150.0Self-emp-not-inc83311.0Bachelors13.0Married-civ-spouseExec-managerialHusbandWhiteMale0.00.013.0United-States
238.0Private215646.0HS-grad9.0DivorcedHandlers-cleanersNot-in-familyWhiteMale0.00.040.0United-States
353.0Private234721.011th7.0Married-civ-spouseHandlers-cleanersHusbandBlackMale0.00.040.0United-States
428.0Private338409.0Bachelors13.0Married-civ-spouseProf-specialtyWifeBlackFemale0.00.040.0Cuba
\n", + "
" + ], + "text/plain": [ + " age workclass fnlwgt education education-num \\\n", + "0 39.0 State-gov 77516.0 Bachelors 13.0 \n", + "1 50.0 Self-emp-not-inc 83311.0 Bachelors 13.0 \n", + "2 38.0 Private 215646.0 HS-grad 9.0 \n", + "3 53.0 Private 234721.0 11th 7.0 \n", + "4 28.0 Private 338409.0 Bachelors 13.0 \n", + "\n", + " marital-status occupation relationship race sex \\\n", + "0 Never-married Adm-clerical Not-in-family White Male \n", + "1 Married-civ-spouse Exec-managerial Husband White Male \n", + "2 Divorced Handlers-cleaners Not-in-family White Male \n", + "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", + "4 Married-civ-spouse Prof-specialty Wife Black Female \n", + "\n", + " capital-gain capital-loss hours-per-week native-country \n", + "0 2174.0 0.0 40.0 United-States \n", + "1 0.0 0.0 13.0 United-States \n", + "2 0.0 0.0 40.0 United-States \n", + "3 0.0 0.0 40.0 United-States \n", + "4 0.0 0.0 40.0 Cuba " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: Custom metrics will not be evaluated because there are no test datasets\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# If you want to find out how we found these parameters check \"Simple classification \n", + "# example with missing feature handling and parameter tuning\" tutorial in `classification`\n", + "# subdirectory of tutorials\n", + "model = cb.CatBoostClassifier(\n", + " class_names=('<=50K', '>50K'),\n", + " loss_function='Logloss',\n", + " eval_metric='AUC', \n", + " custom_metric=['AUC'],\n", + " iterations=100,\n", + " random_seed=20181224,\n", + " learning_rate=0.4234185321620083, \n", + " depth=5, \n", + " 
l2_leaf_reg=9.464266235679002)\n", + "model.fit(\n", + " cb.Pool(X_train, y_train, cat_features=np.where(X_train.dtypes != np.float)[0]),\n", + " verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "model.save_model('adult.cbm')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "156K\tadult.cbm\r\n" + ] + } + ], + "source": [ + "!du -sh adult.cbm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We got the model, now it's time to use it via `catboost` package for D. Next part of the tutorial\n", + "will be in a D project." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}