Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Open sidebar
Frank Rayo
fish-suitability-map
Commits
99599ac2
Commit
99599ac2
authored
Feb 23, 2022
by
Frank Rayo
🚀
Browse files
script for gradient boosted trees regression for fishing suitability
parent
2188acfb
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
735 additions
and
0 deletions
+735
-0
fish-mapping-xgb-cfa.ipynb
fish-mapping-xgb-cfa.ipynb
+735
-0
No files found.
fish-mapping-xgb-cfa.ipynb
0 → 100644
View file @
99599ac2
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import xgboost as xgb\n",
"import sklearn\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import pygraphviz # apt update -y; apt upgrade -y; apt-get install -y graphviz libgraphviz-dev pkg-config; pip install graphviz pygraphviz"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!wget -q https://pedro.asti.dost.gov.ph/gitlab/franco/fish-suitability-map/-/raw/master/2017-2020_filtered_filtered_with_cfa.csv -O /tmp/training-2017-2020-monthly-mean.csv\n",
"!wget -q https://pedro.asti.dost.gov.ph/gitlab/franco/fish-suitability-map/-/raw/master/2021_converted.csv -O /tmp/testing-2021.csv"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>chl</th>\n",
" <th>sst</th>\n",
" <th>bath</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.132899</td>\n",
" <td>25.934444</td>\n",
" <td>-3173</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.121123</td>\n",
" <td>24.907999</td>\n",
" <td>-3173</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.116107</td>\n",
" <td>25.329166</td>\n",
" <td>-3173</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.100442</td>\n",
" <td>25.823666</td>\n",
" <td>-3173</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.081607</td>\n",
" <td>27.529062</td>\n",
" <td>-3173</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" chl sst bath\n",
"0 0.132899 25.934444 -3173\n",
"1 0.121123 24.907999 -3173\n",
"2 0.116107 25.329166 -3173\n",
"3 0.100442 25.823666 -3173\n",
"4 0.081607 27.529062 -3173"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data = pd.read_csv('/tmp/training-2017-2020-monthly-mean.csv', usecols=['bath', 'chl', 'sst'], engine='c', index_col=False)\n",
"train_labels = pd.read_csv('/tmp/training-2017-2020-monthly-mean.csv', usecols=['boat_present'], engine='c', index_col=False)\n",
"\n",
"train_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"n, bins, patches = plt.hist(train_labels, 100)\n",
"plt.xlabel('fishing suitability')\n",
"plt.ylabel('frequency')\n",
"plt.title('Histogram of training data labels')\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"xgtrain = xgb.DMatrix(train_data, train_labels)\n",
"\n",
"# for parameter values, see here https://xgboost.readthedocs.io/en/latest/parameter.html\n",
"param = {'max_depth': 5, # depth of a decision tree\n",
" 'learning_rate': 0.1,\n",
" #'min_split_loss': 1,\n",
" #'min_child_weight': 1,\n",
" 'max_delta_step': 10,\n",
" 'tree_method': 'exact',\n",
" 'predictor': 'cpu_predictor',\n",
" 'objective': 'reg:logistic'}\n",
"\n",
"# train gradient boosted trees\n",
"bst = xgb.train(param, xgtrain, num_boost_round=1000)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# plot feature importance\n",
"xgb.plot_importance(bst)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 2.40.1 (20161225.0304)\n",
" -->\n",
"<!-- Title: %3 Pages: 1 -->\n",
"<svg width=\"1359pt\" height=\"392pt\"\n",
" viewBox=\"0.00 0.00 1359.18 392.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 388)\">\n",
"<title>%3</title>\n",
"<polygon fill=\"#ffffff\" stroke=\"transparent\" points=\"-4,4 -4,-388 1355.1822,-388 1355.1822,4 -4,4\"/>\n",
"<!-- 0 -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>0</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"555.0911\" cy=\"-366\" rx=\"68.7879\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"555.0911\" y=\"-362.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">bath<-210.5</text>\n",
"</g>\n",
"<!-- 1 -->\n",
"<g id=\"node2\" class=\"node\">\n",
"<title>1</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"449.0911\" cy=\"-279\" rx=\"68.7879\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"449.0911\" y=\"-275.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">bath<-333.5</text>\n",
"</g>\n",
"<!-- 0->1 -->\n",
"<g id=\"edge1\" class=\"edge\">\n",
"<title>0->1</title>\n",
"<path fill=\"none\" stroke=\"#0000ff\" d=\"M534.1489,-348.8116C518.0338,-335.585 495.5847,-317.1599 477.7957,-302.5594\"/>\n",
"<polygon fill=\"#0000ff\" stroke=\"#0000ff\" points=\"479.9402,-299.7916 469.9898,-296.1527 475.4991,-305.2025 479.9402,-299.7916\"/>\n",
"<text text-anchor=\"middle\" x=\"553.5911\" y=\"-318.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">yes, missing</text>\n",
"</g>\n",
"<!-- 2 -->\n",
"<g id=\"node3\" class=\"node\">\n",
"<title>2</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"766.0911\" cy=\"-279\" rx=\"94.4839\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"766.0911\" y=\"-275.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">chl<0.359459996</text>\n",
"</g>\n",
"<!-- 0->2 -->\n",
"<g id=\"edge2\" class=\"edge\">\n",
"<title>0->2</title>\n",
"<path fill=\"none\" stroke=\"#ff0000\" d=\"M592.328,-350.6464C627.0108,-336.3459 678.943,-314.9331 717.0012,-299.2409\"/>\n",
"<polygon fill=\"#ff0000\" stroke=\"#ff0000\" points=\"718.4483,-302.4301 726.3591,-295.3824 715.7799,-295.9586 718.4483,-302.4301\"/>\n",
"<text text-anchor=\"middle\" x=\"683.0911\" y=\"-318.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">no</text>\n",
"</g>\n",
"<!-- 3 -->\n",
"<g id=\"node4\" class=\"node\">\n",
"<title>3</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"262.0911\" cy=\"-192\" rx=\"100.1823\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"262.0911\" y=\"-188.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">leaf=-0.181823254</text>\n",
"</g>\n",
"<!-- 1->3 -->\n",
"<g id=\"edge3\" class=\"edge\">\n",
"<title>1->3</title>\n",
"<path fill=\"none\" stroke=\"#0000ff\" d=\"M407.8768,-264.3401C391.5553,-258.2004 372.7478,-250.7041 356.0911,-243 337.465,-234.385 317.3996,-223.7019 300.6101,-214.3425\"/>\n",
"<polygon fill=\"#0000ff\" stroke=\"#0000ff\" points=\"302.081,-211.1542 291.6495,-209.3006 298.6483,-217.2547 302.081,-211.1542\"/>\n",
"<text text-anchor=\"middle\" x=\"400.5911\" y=\"-231.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">yes, missing</text>\n",
"</g>\n",
"<!-- 4 -->\n",
"<g id=\"node5\" class=\"node\">\n",
"<title>4</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"449.0911\" cy=\"-192\" rx=\"68.7879\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"449.0911\" y=\"-188.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">bath<-332.5</text>\n",
"</g>\n",
"<!-- 1->4 -->\n",
"<g id=\"edge4\" class=\"edge\">\n",
"<title>1->4</title>\n",
"<path fill=\"none\" stroke=\"#ff0000\" d=\"M449.0911,-260.9735C449.0911,-249.1918 449.0911,-233.5607 449.0911,-220.1581\"/>\n",
"<polygon fill=\"#ff0000\" stroke=\"#ff0000\" points=\"452.5912,-220.0033 449.0911,-210.0034 445.5912,-220.0034 452.5912,-220.0033\"/>\n",
"<text text-anchor=\"middle\" x=\"458.0911\" y=\"-231.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">no</text>\n",
"</g>\n",
"<!-- 5 -->\n",
"<g id=\"node10\" class=\"node\">\n",
"<title>5</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"766.0911\" cy=\"-192\" rx=\"63.0888\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"766.0911\" y=\"-188.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">bath<-70.5</text>\n",
"</g>\n",
"<!-- 2->5 -->\n",
"<g id=\"edge9\" class=\"edge\">\n",
"<title>2->5</title>\n",
"<path fill=\"none\" stroke=\"#0000ff\" d=\"M766.0911,-260.9735C766.0911,-249.1918 766.0911,-233.5607 766.0911,-220.1581\"/>\n",
"<polygon fill=\"#0000ff\" stroke=\"#0000ff\" points=\"769.5912,-220.0033 766.0911,-210.0034 762.5912,-220.0034 769.5912,-220.0033\"/>\n",
"<text text-anchor=\"middle\" x=\"810.5911\" y=\"-231.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">yes, missing</text>\n",
"</g>\n",
"<!-- 6 -->\n",
"<g id=\"node11\" class=\"node\">\n",
"<title>6</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"1054.0911\" cy=\"-192\" rx=\"63.0888\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"1054.0911\" y=\"-188.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">bath<-39.5</text>\n",
"</g>\n",
"<!-- 2->6 -->\n",
"<g id=\"edge10\" class=\"edge\">\n",
"<title>2->6</title>\n",
"<path fill=\"none\" stroke=\"#ff0000\" d=\"M816.5865,-263.7462C868.2667,-248.1345 948.2556,-223.9711 1000.8961,-208.0693\"/>\n",
"<polygon fill=\"#ff0000\" stroke=\"#ff0000\" points=\"1002.1577,-211.3445 1010.7183,-205.1022 1000.1334,-204.6436 1002.1577,-211.3445\"/>\n",
"<text text-anchor=\"middle\" x=\"937.0911\" y=\"-231.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">no</text>\n",
"</g>\n",
"<!-- 7 -->\n",
"<g id=\"node6\" class=\"node\">\n",
"<title>7</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"237.0911\" cy=\"-105\" rx=\"94.4839\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"237.0911\" y=\"-101.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">chl<0.194019914</text>\n",
"</g>\n",
"<!-- 4->7 -->\n",
"<g id=\"edge5\" class=\"edge\">\n",
"<title>4->7</title>\n",
"<path fill=\"none\" stroke=\"#0000ff\" d=\"M409.5997,-177.0688C393.0201,-170.7082 373.5662,-163.1243 356.0911,-156 332.2491,-146.28 305.9177,-135.0475 284.0984,-125.6021\"/>\n",
"<polygon fill=\"#0000ff\" stroke=\"#0000ff\" points=\"285.2977,-122.3072 274.7309,-121.537 282.511,-128.7287 285.2977,-122.3072\"/>\n",
"<text text-anchor=\"middle\" x=\"400.5911\" y=\"-144.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">yes, missing</text>\n",
"</g>\n",
"<!-- 8 -->\n",
"<g id=\"node7\" class=\"node\">\n",
"<title>8</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"449.0911\" cy=\"-105\" rx=\"100.1823\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"449.0911\" y=\"-101.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">leaf=-0.180456504</text>\n",
"</g>\n",
"<!-- 4->8 -->\n",
"<g id=\"edge6\" class=\"edge\">\n",
"<title>4->8</title>\n",
"<path fill=\"none\" stroke=\"#ff0000\" d=\"M449.0911,-173.9735C449.0911,-162.1918 449.0911,-146.5607 449.0911,-133.1581\"/>\n",
"<polygon fill=\"#ff0000\" stroke=\"#ff0000\" points=\"452.5912,-133.0033 449.0911,-123.0034 445.5912,-133.0034 452.5912,-133.0033\"/>\n",
"<text text-anchor=\"middle\" x=\"458.0911\" y=\"-144.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">no</text>\n",
"</g>\n",
"<!-- 13 -->\n",
"<g id=\"node8\" class=\"node\">\n",
"<title>13</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"100.0911\" cy=\"-18\" rx=\"100.1823\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"100.0911\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">leaf=-0.129272148</text>\n",
"</g>\n",
"<!-- 7->13 -->\n",
"<g id=\"edge7\" class=\"edge\">\n",
"<title>7->13</title>\n",
"<path fill=\"none\" stroke=\"#0000ff\" d=\"M209.5338,-87.6282C200.1824,-81.7244 189.6773,-75.0827 180.0911,-69 165.7164,-59.8787 149.9082,-49.8092 136.1673,-41.044\"/>\n",
"<polygon fill=\"#0000ff\" stroke=\"#0000ff\" points=\"137.7847,-37.9243 127.472,-35.4955 134.0192,-43.8252 137.7847,-37.9243\"/>\n",
"<text text-anchor=\"middle\" x=\"224.5911\" y=\"-57.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">yes, missing</text>\n",
"</g>\n",
"<!-- 14 -->\n",
"<g id=\"node9\" class=\"node\">\n",
"<title>14</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"318.0911\" cy=\"-18\" rx=\"100.1823\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"318.0911\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">leaf=-0.170920715</text>\n",
"</g>\n",
"<!-- 7->14 -->\n",
"<g id=\"edge8\" class=\"edge\">\n",
"<title>7->14</title>\n",
"<path fill=\"none\" stroke=\"#ff0000\" d=\"M253.8744,-86.9735C265.6038,-74.3752 281.4312,-57.3755 294.4377,-43.4055\"/>\n",
"<polygon fill=\"#ff0000\" stroke=\"#ff0000\" points=\"297.0768,-45.7073 301.3294,-36.0034 291.9535,-40.9374 297.0768,-45.7073\"/>\n",
"<text text-anchor=\"middle\" x=\"292.0911\" y=\"-57.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">no</text>\n",
"</g>\n",
"<!-- 9 -->\n",
"<g id=\"node12\" class=\"node\">\n",
"<title>9</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"661.0911\" cy=\"-105\" rx=\"94.4839\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"661.0911\" y=\"-101.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">chl<0.154711366</text>\n",
"</g>\n",
"<!-- 5->9 -->\n",
"<g id=\"edge11\" class=\"edge\">\n",
"<title>5->9</title>\n",
"<path fill=\"none\" stroke=\"#0000ff\" d=\"M745.3465,-174.8116C729.5907,-161.7568 707.7227,-143.6376 690.2138,-129.1302\"/>\n",
"<polygon fill=\"#0000ff\" stroke=\"#0000ff\" points=\"692.4468,-126.4351 682.5135,-122.75 687.9807,-131.8253 692.4468,-126.4351\"/>\n",
"<text text-anchor=\"middle\" x=\"764.5911\" y=\"-144.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">yes, missing</text>\n",
"</g>\n",
"<!-- 10 -->\n",
"<g id=\"node13\" class=\"node\">\n",
"<title>10</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"873.0911\" cy=\"-105\" rx=\"100.1823\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"873.0911\" y=\"-101.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">leaf=-0.180226803</text>\n",
"</g>\n",
"<!-- 5->10 -->\n",
"<g id=\"edge12\" class=\"edge\">\n",
"<title>5->10</title>\n",
"<path fill=\"none\" stroke=\"#ff0000\" d=\"M788.7444,-175.0172C796.5282,-169.0847 805.2529,-162.3263 813.0911,-156 823.668,-147.4632 835.0792,-137.8649 845.0814,-129.3173\"/>\n",
"<polygon fill=\"#ff0000\" stroke=\"#ff0000\" points=\"847.4497,-131.897 852.758,-122.7279 842.8904,-126.5855 847.4497,-131.897\"/>\n",
"<text text-anchor=\"middle\" x=\"839.0911\" y=\"-144.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">no</text>\n",
"</g>\n",
"<!-- 11 -->\n",
"<g id=\"node16\" class=\"node\">\n",
"<title>11</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"1054.0911\" cy=\"-105\" rx=\"63.0888\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"1054.0911\" y=\"-101.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">bath<-84.5</text>\n",
"</g>\n",
"<!-- 6->11 -->\n",
"<g id=\"edge15\" class=\"edge\">\n",
"<title>6->11</title>\n",
"<path fill=\"none\" stroke=\"#0000ff\" d=\"M1054.0911,-173.9735C1054.0911,-162.1918 1054.0911,-146.5607 1054.0911,-133.1581\"/>\n",
"<polygon fill=\"#0000ff\" stroke=\"#0000ff\" points=\"1057.5912,-133.0033 1054.0911,-123.0034 1050.5912,-133.0034 1057.5912,-133.0033\"/>\n",
"<text text-anchor=\"middle\" x=\"1098.5911\" y=\"-144.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">yes, missing</text>\n",
"</g>\n",
"<!-- 12 -->\n",
"<g id=\"node17\" class=\"node\">\n",
"<title>12</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"1235.0911\" cy=\"-105\" rx=\"100.1823\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"1235.0911\" y=\"-101.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">leaf=-0.178304657</text>\n",
"</g>\n",
"<!-- 6->12 -->\n",
"<g id=\"edge16\" class=\"edge\">\n",
"<title>6->12</title>\n",
"<path fill=\"none\" stroke=\"#ff0000\" d=\"M1094.4967,-177.9248C1111.046,-171.7584 1130.2322,-164.0875 1147.0911,-156 1164.5258,-147.6363 1183.147,-137.0959 1198.7495,-127.7714\"/>\n",
"<polygon fill=\"#ff0000\" stroke=\"#ff0000\" points=\"1200.8087,-130.6165 1207.5554,-122.4475 1197.187,-124.6262 1200.8087,-130.6165\"/>\n",
"<text text-anchor=\"middle\" x=\"1184.0911\" y=\"-144.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">no</text>\n",
"</g>\n",
"<!-- 15 -->\n",
"<g id=\"node14\" class=\"node\">\n",
"<title>15</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"577.0911\" cy=\"-18\" rx=\"100.1823\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"577.0911\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">leaf=-0.180237263</text>\n",
"</g>\n",
"<!-- 9->15 -->\n",
"<g id=\"edge13\" class=\"edge\">\n",
"<title>9->15</title>\n",
"<path fill=\"none\" stroke=\"#0000ff\" d=\"M643.6863,-86.9735C631.5224,-74.3752 615.1088,-57.3755 601.6206,-43.4055\"/>\n",
"<polygon fill=\"#0000ff\" stroke=\"#0000ff\" points=\"603.9376,-40.7663 594.4737,-36.0034 598.9017,-45.6285 603.9376,-40.7663\"/>\n",
"<text text-anchor=\"middle\" x=\"669.5911\" y=\"-57.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">yes, missing</text>\n",
"</g>\n",
"<!-- 16 -->\n",
"<g id=\"node15\" class=\"node\">\n",
"<title>16</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"795.0911\" cy=\"-18\" rx=\"100.1823\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"795.0911\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">leaf=-0.175607845</text>\n",
"</g>\n",
"<!-- 9->16 -->\n",
"<g id=\"edge14\" class=\"edge\">\n",
"<title>9->16</title>\n",
"<path fill=\"none\" stroke=\"#ff0000\" d=\"M688.7103,-87.7248C698.0663,-81.8281 708.5585,-75.1662 718.0911,-69 732.0067,-59.9987 747.2374,-49.9523 760.4529,-41.172\"/>\n",
"<polygon fill=\"#ff0000\" stroke=\"#ff0000\" points=\"762.4266,-44.0628 768.8123,-35.6087 758.5483,-38.2354 762.4266,-44.0628\"/>\n",
"<text text-anchor=\"middle\" x=\"749.0911\" y=\"-57.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">no</text>\n",
"</g>\n",
"<!-- 17 -->\n",
"<g id=\"node18\" class=\"node\">\n",
"<title>17</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"1033.0911\" cy=\"-18\" rx=\"100.1823\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"1033.0911\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">leaf=-0.177121475</text>\n",
"</g>\n",
"<!-- 11->17 -->\n",
"<g id=\"edge17\" class=\"edge\">\n",
"<title>11->17</title>\n",
"<path fill=\"none\" stroke=\"#0000ff\" d=\"M1049.7399,-86.9735C1046.8679,-75.0751 1043.0482,-59.2508 1039.792,-45.7606\"/>\n",
"<polygon fill=\"#0000ff\" stroke=\"#0000ff\" points=\"1043.1855,-44.9029 1037.4368,-36.0034 1036.381,-46.5455 1043.1855,-44.9029\"/>\n",
"<text text-anchor=\"middle\" x=\"1089.5911\" y=\"-57.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">yes, missing</text>\n",
"</g>\n",
"<!-- 18 -->\n",
"<g id=\"node19\" class=\"node\">\n",
"<title>18</title>\n",
"<ellipse fill=\"none\" stroke=\"#000000\" cx=\"1251.0911\" cy=\"-18\" rx=\"100.1823\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"1251.0911\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">leaf=-0.159216523</text>\n",
"</g>\n",
"<!-- 11->18 -->\n",
"<g id=\"edge18\" class=\"edge\">\n",
"<title>11->18</title>\n",
"<path fill=\"none\" stroke=\"#ff0000\" d=\"M1089.7151,-89.91C1104.6828,-83.5238 1122.2609,-75.9637 1138.0911,-69 1160.2161,-59.2672 1184.675,-48.2592 1205.1625,-38.9678\"/>\n",
"<polygon fill=\"#ff0000\" stroke=\"#ff0000\" points=\"1206.6156,-42.152 1214.2735,-34.8303 1203.7212,-35.7784 1206.6156,-42.152\"/>\n",
"<text text-anchor=\"middle\" x=\"1179.0911\" y=\"-57.8\" font-family=\"Times,serif\" font-size=\"14.00\" fill=\"#000000\">no</text>\n",
"</g>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.files.Source at 0x7f1cfd3e9a90>"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# visualize the second boosted tree (num_trees is the zero-based tree index)\n",
"xgb.to_graphviz(bst, num_trees=1)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>chl</th>\n",
" <th>sst</th>\n",
" <th>bath</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6.339128</td>\n",
" <td>18.666000</td>\n",
" <td>-2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>21.735666</td>\n",
" <td>-2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>22.866667</td>\n",
" <td>-2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>25.441250</td>\n",
" <td>-2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>29.320999</td>\n",
" <td>-2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" chl sst bath\n",
"0 6.339128 18.666000 -2\n",
"1 NaN 21.735666 -2\n",
"2 NaN 22.866667 -2\n",
"3 NaN 25.441250 -2\n",
"4 NaN 29.320999 -2"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# read test data\n",
"test_data = pd.read_csv('/tmp/testing-2021.csv', usecols=['bath', 'chl', 'sst'])\n",
"test_labels = pd.read_csv('/tmp/testing-2021.csv', usecols=['boat_present'])\n",
"\n",
"test_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAgvUlEQVR4nO3dfZgcZZX+8e8NAYQMBJfo/CAgQUxEIIBmBFRcZ1ZcAyLoihKMaBDMKqL4ghe4KiK4uyIiKqAxKMaXwIjoQhZ50Y2MWdFgkhUIAdEYogQkEQKBgahEzu+PegYqbc1MTWdquntyf66rr1R1PfXUOd2TPl1PVVcpIjAzM6u1VaMDMDOz5uQCYWZmhVwgzMyskAuEmZkVcoEwM7NCLhBmZlbIBcI2IWm5pM5Gx9FIkt4o6V5JvZJe3IDth6QXpOnZkj4xAtucKeln/SybmGIaU6KfTkmr64yh7nWtGi4QWxBJqyQdXvPcJh8MEbFfRPQM0k/pD4wW9Tng1Ihoi4hfNTKQiHh3RJw7WDtJPZJOHomYbMvhAmFNpwkKz57A8uHoqAlyMaubC4RtIr+XIelgSUskPSppjaTPp2YL07+PpGGYl0naStLHJf1e0lpJ35I0Ltfv29OyhyR9omY7Z0u6StJ3JD0KzEzb/oWkRyT9UdLFkrbN9ReSTpH0W0mPSTpX0t6Sfp7ivTLfvibHwlglbSepF9gauE3S7/pZPyS9X9JKSQ9KOl/SVmnZTEk3S7pQ0kPA2anfz0n6Q3odZ0vaPtffR1KO90t6Z8225kr6dG7+GEm3phx/J2mapH8HXglcnN6Pi1PbfST9WNI6SXdLekuun10kzU/9/BLYe8A/jE1jOlHSXel1XynpXwva/Ft6bVZJmpF7fsDXoqaPMyTdl7Zzt6RXl43RhklE+LGFPIBVwOE1z80EflbUBvgFcEKabgMOTdMTgQDG5NZ7J7ACeH5q+wPg22nZvkAvcBiwLdkQzpO57Zyd5t9A9qVle2AqcCgwJm3vLuADue0FcA2wE7Af8BdgQdr+OOBO4B39vA79xprr+wUDvI4B3AT8A/A84DfAybnXcyPwvhT79sCFwPzUfkfgv4H/TO2nAWuA/YGxwOX57QNzgU+n6YOB9cBr0us0AdgnLevpiyHNjwXuBU5McbwYeBDYNy3vBq5M7fYH7sv/HdTku8n7DbyOrKAIeBXwBPCStKwz5f95YLu0/HHghWn5QK9FJ7A6Tb8wxb9bLoa9G/1/aEt7NDyAuoKGy4C1wB0l278lfWAsBy5vdPwNfN1WkX1QP5J7PEH/BWIh8ClgfE0/m3xgpOcWAKfk5l9I9qE/BjgLuCK3bAfgr2xaIBYOEvsHgP/KzQfwitz8UuCM3PwFwBf66avfWHN9D1YgpuXmTwEWpOmZwB9yy5Q+IPfOPfcy4J7c3/Jncssm03+B+CpwYT8x9bBpgTgO+N+aNl8FPkm2h/QkqbikZf9ByQJRsPxq4LQ03UlWIMbmll8JfKLEa9HJMwXiBWT/xw8Htmn0/50t9dGqQ0xzyb55DUrSJOCjZB8m+5F90GzJ3hARO/c9yD7c+nMS2QfWryUtlnTUAG13A36fm/89WXFoT8vu7VsQEU8AD9Wsf29+RtJkSddKeiANO/0HML5mnTW56Q0F8211xFpWPt7fpz6Llj2HrCAuTcNljwA3pOf7Yqntqz97AIXDXgX2BA7p22ba7gzg/6VtjxnCdjch6QhJi9LQ1SPAkWz63jwcEY/X9L0bg78WT4uIFWT/V88G1krqlrRbbTurVksWiIhYCKzLP5fGn2+QtFTS/0raJy16F3BJRDyc1l07wuG2rIj4bUQcDzwXOA+4StJYsm+Tte4n+1Dq8zyyb5JrgD8Cu/ctSGPOu9Rurmb+K8CvgUkRsRPwb2TfQIfDQLGWtUfN+vfn5vO5PEhWrPbLFeZxEdFXvP5Y0Fd/7qX/YwW1r9+9wE/zXwYiOyvrPcCfyP
Itu92nSdoO+D7ZMGF7+pJxHZu+N89Ofyf5vu9n8Ndi04QiLo+Iw8jeqyD7G7QR1JIFoh9zgPdFxFTgdODL6fnJwOR04HCRpFJ7HgaS3ibpORHxFNlwFMBTZB8wT5GN4fe5AvigpL0ktZF94/9uRGwErgJeL+nl6cDx2Qz+Yb8j8CjQm4r9e4YprcFiLesjkp4taQ/gNOC7RY3Sa3cpcKGk5wJImiDptanJlWQH5feVtAPZEFB/vg6cKOnV6UD7hNwXoTVs+n5cS/Z3f4KkbdLjpZJeFBF/IzvucrakHSTtC7yjZN7bkh1b+BOwUdIRwD8XtPuUpG0lvRI4CvheidfiaZJeKOmfUkH6M1lheapkjDZMRkWBSP/JXw58T9KtZGOtu6bFY4BJZOObxwOXStp55KNsSdOA5crO7PkiMD0iNqQhon8Hbk5DBYeSjaV/m+y4xT1k/6nfBxARy9N0N9k35l6y8eW/DLDt04G3Ao+RfagUfgDXqd9Yh+AasuMetwI/JPvw7s8ZZAfFF6Xhsv8hO+5BRFwPfAH4SWrzk/46iYhfkh10vpDsYPVPeWZP6IvAsZIelvSliHiM7IN7Otm39wfIvoFvl9qfSjYE9wDZkO03yiSd+n0/WWF7mOw9ml/T7IG07H5gHvDuiPj1YK9Fje2Az5DtdTxAthf70TIx2vBRRGveMEjSRODaiNhf0k7A3RGxa0G72cAtEfGNNL8AODMiFo9owPa0VNAfIRs+uqfB4QyZpCCLfUWjYzGr0qjYg4iIR4F7JL0ZQJkD0+KryfYekDSebMhpZQPC3KJJen0azhhLNn69jOyMKTNrUi1ZICRdQXaO/gslrZZ0EtkZGidJuo3sdNZjUvMbgYck3Ul27vpHIqL2DBqr3jFkQw73kw35TY9W3X0120K07BCTmZlVqyX3IMzMrHotdyGx8ePHx8SJE+ta9/HHH2fs2LGDN2whzqk1OKfWMJpzWrp06YMR8Xc/ShxIyxWIiRMnsmTJkrrW7enpobOzc3gDajDn1BqcU2sYzTlJKv1r+T4eYjIzs0KVFQhJlym7lPIdA7TpVHbp4uWSflpVLGZmNnRV7kHMZYAL6qVfM38ZODpdRO/NFcZiZmZDVFmBKLqgXo23Aj+IiD+k9r6InplZE6n0dxD5y2EULPsCsA3ZzV52BL4YEd/qp59ZwCyA9vb2qd3d3XXF09vbS1tbf1eAbk3OqTU4p9YwmnPq6upaGhEdQ1q5yptNkN1opPCmPsDFwCKyO1qNB34LTB6sz6lTp0a9brrpprrXbVbOqTU4p9YwmnMClsQQP8MbeZrrauChyG4s8rikhcCBZLdvNDOzBmvkaa7XAIdJGpOug38I2X2HzcysCVS2B5EuqNcJjJe0muxGKNsARMTsiLhL0g3A7WQ3AvlaRPR7SqyZmY2sygpEZLeqHKzN+cD5VcVQa9l965l55g8BWPWZ143UZs3MWpJ/SW1mZoVcIMzMrJALhJmZFXKBMDOzQi4QZmZWyAXCzMwKuUCYmVkhFwgzMyvkAmFmZoVcIMzMrJALhJmZFXKBMDOzQi4QZmZWyAXCzMwKuUCYmVkhFwgzMyvkAmFmZoVcIMzMrFBlBULSZZLWShrwPtOSXippo6Rjq4rFzMyGrso9iLnAtIEaSNoaOA/4UYVxmJlZHSorEBGxEFg3SLP3Ad8H1lYVh5mZ1UcRUV3n0kTg2ojYv2DZBOByoAu4LLW7qp9+ZgGzANrb26d2d3fXFc/adetZsyGbnjJhXF19NJve3l7a2toaHcawck6twTm1hr6curq6lkZEx1DWHVNVUCV8ATgjIp6SNGDDiJgDzAHo6OiIzs7OujZ40bxruGBZlvKqGfX10Wx6enqo9/VoVs6pNTin1rA5OTWyQHQA3ak4jAeOlLQxIq5uYExmZpY0rEBExF5905Lmkg0xXd2oeMzMbFOVFQhJVwCdwHhJq4FPAtsARMTsqrZrZmbDo7ICERHHD6HtzKriMD
Oz+viX1GZmVsgFwszMCrlAmJlZIRcIMzMr5AJhZmaFXCDMzKyQC4SZmRVygTAzs0IuEGZmVsgFwszMCrlAmJlZIRcIMzMr5AJhZmaFXCDMzKyQC4SZmRVygTAzs0IuEGZmVqiyAiHpMklrJd3Rz/IZkm6XtEzSzyUdWFUsZmY2dFXuQcwFpg2w/B7gVRExBTgXmFNhLGZmNkRV3pN6oaSJAyz/eW52EbB7VbGYmdnQKSKq6zwrENdGxP6DtDsd2CciTu5n+SxgFkB7e/vU7u7uuuJZu249azZk01MmjKurj2bT29tLW1tbo8MYVs6pNTin1tCXU1dX19KI6BjKupXtQZQlqQs4CTisvzYRMYc0BNXR0RGdnZ11beuieddwwbIs5VUz6uuj2fT09FDv69GsnFNrcE6tYXNyamiBkHQA8DXgiIh4qJGxmJnZphp2mquk5wE/AE6IiN80Kg4zMytW2R6EpCuATmC8pNXAJ4FtACJiNnAWsAvwZUkAG4c6PmZmZtWp8iym4wdZfjJQeFDazMwaz7+kNjOzQi4QZmZWyAXCzMwKuUCYmVkhFwgzMyvkAmFmZoVcIMzMrJALhJmZFXKBMDOzQi4QZmZWyAXCzMwKuUCYmVkhFwgzMyvkAmFmZoVcIMzMrJALhJmZFXKBMDOzQi4QZmZWqLICIekySWsl3dHPckn6kqQVkm6X9JKqYjEzs6Grcg9iLjBtgOVHAJPSYxbwlQpjMTOzIaqsQETEQmDdAE2OAb4VmUXAzpJ2rSoeMzMbGkVEdZ1LE4FrI2L/gmXXAp+JiJ+l+QXAGRGxpKDtLLK9DNrb26d2d3fXFc/adetZsyGbnjJhXF19NJve3l7a2toaHcawck6twTm1hr6curq6lkZEx1DWHTNYA0lLgcuAyyPi4XqD3BwRMQeYA9DR0RGdnZ119XPRvGu4YFmW8qoZ9fXRbHp6eqj39WhWzqk1OKfWsDk5lRliOg7YDVgsqVvSayWprq1t6j5gj9z87uk5MzNrAoMWiIhYEREfAyYDl5PtTfxe0qck/cNmbHs+8PZ0NtOhwPqI+ONm9GdmZsNo0CEmAEkHACcCRwLfB+YBhwE/AQ7qZ50rgE5gvKTVwCeBbQAiYjZwXepvBfBE6t/MzJpE2WMQjwBfB86MiL+kRbdIekV/60XE8QP1G9nR8feWD9XMzEZSmT2IN0fEyqIFEfEvwxyPmZk1iTIHqU+WtHPfjKRnS/p0dSGZmVkzKFMgjoiIR/pm0qmuR1YWkZmZNYUyBWJrSdv1zUjaHthugPZmZjYKlDkGMQ9YIOkbaf5E4JvVhWRmZs1g0AIREedJuh14dXrq3Ii4sdqwzMys0Ur9DiIirgeurzgWMzNrIoMeg5D0L5J+K2m9pEclPSbp0ZEIzszMGqfMHsRngddHxF1VB2NmZs2jzFlMa1wczMy2PGX2IJZI+i5wNdB3mQ0i4gdVBWVmZo1XpkDsRHYxvX/OPReAC4SZ2ShW5jRXX2XVzGwLVOYspsmSFki6I80fIOnj1YdmZmaNVOYg9aXAR4EnASLidmB6lUGZmVnjlSkQO0TEL2ue21hFMGZm1jzKFIgHJe1NdmAaSccCvjWomdkoV6ZAvBf4KrCPpPuADwDvKdO5pGmS7pa0QtKZBcufJ+kmSb+SdLskX0bczKxJlDmLaSVwuKSxwFYR8ViZjiVtDVwCvAZYDSyWND8i7sw1+zhwZUR8RdK+ZPepnjjEHMzMrAJl7kl9Vs08ABFxziCrHgys6LtdqaRu4BggXyCC7HcWAOOA+0tFbWZmlVNEDNxA+nBu9lnAUcBdEfHOQdY7FpgWESen+ROAQyLi1FybXYEfAc8GxgKHR8TSgr5mAbMA2tvbp3Z3d5dI7e+tXbeeNRuy6SkTxtXVR7Pp7e2lra2t0WEMK+fUGpxTa+jLqaura2lEdAxl3TJDTBfk5yV9Dhiu+0EcD8yNiAskvQz4tqT9I+KpmhjmAHMAOjo6orOzs6
6NXTTvGi5YlqW8akZ9fTSbnp4e6n09mpVzag3OqTVsTk5lDlLX2gHYvUS7+4A9cvO7p+fyTgKuBIiIX5DtoYyvIyYzMxtmZY5BLCOd4gpsDTwHGOz4A8BiYJKkvcgKw3TgrTVt/kB2p7q5kl5EViD+VC50MzOrUpmL9R2Vm95IdvnvQX8oFxEbJZ1KNhy1NXBZRCyXdA6wJCLmAx8GLpX0QbIiNDMGOyhiZmYjokyBqD2tdae+M5kAImJdfytGxHVkp67mnzsrN30n8IpSkZqZ2YgqUyD+j+xYwsOAgJ3JhoYg+9b//EoiMzOzhipzkPrHZLccHR8Ru5ANOf0oIvaKCBcHM7NRqkyBODQNFQEQEdcDL68uJDMzawZlhpjuT/d/+E6an4F/8WxmNuqV2YM4nuzU1v8iu83oc9JzZmY2ipX5JfU64DRJYyPi8RGIyczMmkCZW46+XNKdwF1p/kBJX648MjMza6gyQ0wXAq8FHgKIiNuAf6wyKDMza7xS12KKiHtrnvpbBbGYmVkTKXMW072SXg6EpG2A00jDTWZmNnqV2YN4N9ltRyeQXXTvoDRvZmaj2IB7EOm2oV+MiBkjFI+ZmTWJAfcgIuJvwJ6Sth2heMzMrEmUOQaxErhZ0nzg6d9BRMTnK4vKzMwart89CEnfTpNHA9emtjvmHmZmNooNtAcxVdJuZJf2vmiE4jEzsyYxUIGYDSwA9gKW5J4Xvg+Emdmo1+8QU0R8KSJeBHwjIp6fe5S+D4SkaZLulrRC0pn9tHmLpDslLZd0eZ15mJnZMCtzsb731NNxOkX2EuA1wGpgsaT56TajfW0mAR8FXhERD0t6bj3bMjOz4VfqUht1OhhYERErI+KvQDdwTE2bdwGXRMTDABGxtsJ4zMxsCKosEBOA/DWcVqfn8iYDkyXdLGmRpGkVxmNmZkOgiKimY+lYYFpEnJzmTwAOiYhTc22uBZ4E3gLsDiwEpkTEIzV9zQJmAbS3t0/t7u6uK6a169azZkM2PWXCuLr6aDa9vb20tbU1Ooxh5Zxag3NqDX05dXV1LY2IjqGsW+aHcvW6D9gjN797ei5vNXBLRDwJ3CPpN8AkYHG+UUTMAeYAdHR0RGdnZ10BXTTvGi5YlqW8akZ9fTSbnp4e6n09mpVzag3OqTVsTk5VDjEtBiZJ2itdqmM6ML+mzdVAJ4Ck8WRDTisrjMnMzEqqrEBExEbgVOBGssuDXxkRyyWdI+no1OxG4KF0x7qbgI9ExENVxWRmZuVVOcRERFwHXFfz3Fm56QA+lB5mZtZEqhxiMjOzFuYCYWZmhVwgzMyskAuEmZkVcoEwM7NCLhBmZlbIBcLMzAq5QJiZWSEXCDMzK+QCYWZmhVwgzMyskAuEmZkVcoEwM7NCLhBmZlbIBcLMzAq5QJiZWSEXCDMzK+QCYWZmhSotEJKmSbpb0gpJZw7Q7k2SQlJHlfGYmVl5lRUISVsDlwBHAPsCx0vat6DdjsBpwC1VxWJmZkNX5R7EwcCKiFgZEX8FuoFjCtqdC5wH/LnCWMzMbIgUEdV0LB0LTIuIk9P8CcAhEXFqrs1LgI9FxJsk9QCnR8SSgr5mAbMA2tvbp3Z3d9cV09p161mzIZueMmFcXX00m97eXtra2hodxrByTq3BObWGvpy6urqWRsSQhvHHVBXUYCRtBXwemDlY24iYA8wB6OjoiM7Ozrq2edG8a7hgWZbyqhn19dFsenp6qPf1aFbOqTU4p9awOTlVOcR0H7BHbn739FyfHYH9gR5Jq4BDgfk+UG1m1hyqLBCLgUmS9pK0LTAdmN+3MCLWR8T4iJgYEROBRcDRRUNMZmY28iorEBGxETgVuBG4C7gyIpZLOkfS0VVt18zMhkelxyAi4jrguprnzuqnbWeVsZiZ2dD4l9RmZlbIBcLMzAq5QJiZWSEXCDMzK+QCYWZmhVwgzMyskAuEmZkVcoEwM7NCLhBmZlbIBcLMzAq5QJiZWSEXCDMzK+QCYW
ZmhVwgzMyskAuEmZkVcoEwM7NCLhBmZlao0gIhaZqkuyWtkHRmwfIPSbpT0u2SFkjas8p4zMysvMoKhKStgUuAI4B9geMl7VvT7FdAR0QcAFwFfLaqeMzMbGiq3IM4GFgRESsj4q9AN3BMvkFE3BQRT6TZRcDuFcZjZmZDoIiopmPpWGBaRJyc5k8ADomIU/tpfzHwQER8umDZLGAWQHt7+9Tu7u66Ylq7bj1rNmTTUyaMq6uPZtPb20tbW1ujwxhWzqk1OKfW0JdTV1fX0ojoGMq6Y6oKaigkvQ3oAF5VtDwi5gBzADo6OqKzs7Ou7Vw07xouWJalvGpGfX00m56eHup9PZqVc2oNzqk1bE5OVRaI+4A9cvO7p+c2Ielw4GPAqyLiLxXGY2ZmQ1DlMYjFwCRJe0naFpgOzM83kPRi4KvA0RGxtsJYzMxsiCorEBGxETgVuBG4C7gyIpZLOkfS0anZ+UAb8D1Jt0qa3093ZmY2wio9BhER1wHX1Tx3Vm768Cq3b2Zm9fMvqc3MrJALhJmZFXKBMDOzQi4QZmZWyAXCzMwKuUCYmVkhFwgzMyvkAmFmZoVcIMzMrJALhJmZFXKBMDOzQi4QZmZWyAXCzMwKuUCYmVkhFwgzMyvkAmFmZoVcIMzMrFCld5RrZhPP/OEm86s+87oGRWJm1pwq3YOQNE3S3ZJWSDqzYPl2kr6blt8iaWKV8ZiZWXmV7UFI2hq4BHgNsBpYLGl+RNyZa3YS8HBEvEDSdOA84LiqYhpIfo/CexNmZtUOMR0MrIiIlQCSuoFjgHyBOAY4O01fBVwsSRERFcY1qNrhpyIuImY22lVZICYA9+bmVwOH9NcmIjZKWg/sAjyYbyRpFjArzfZKurvOmMbX9l0vnTccvQyLYcupiTin1uCcWkNfTnsOdcWWOEgdEXOAOZvbj6QlEdExDCE1DefUGpxTa3BOm6ryIPV9wB65+d3Tc4VtJI0BxgEPVRiTmZmVVGWBWAxMkrSXpG2B6cD8mjbzgXek6WOBnzT6+IOZmWUqG2JKxxROBW4EtgYui4jlks4BlkTEfODrwLclrQDWkRWRKm32MFUTck6twTm1BueUI39hNzOzIr7UhpmZFXKBMDOzQqOyQIzGS3yUyOkfJf2fpI2Sjm1EjENVIqcPSbpT0u2SFkga8nncI61ETu+WtEzSrZJ+JmnfRsQ5FIPllGv3JkkhqelPEy3xPs2U9Kf0Pt0q6eRGxDkUZd4nSW9J/6eWS7p80E4jYlQ9yA6I/w54PrAtcBuwb02bU4DZaXo68N1Gxz0MOU0EDgC+BRzb6JiHKacuYIc0/Z5R8j7tlJs+Grih0XFvbk6p3Y7AQmAR0NHouIfhfZoJXNzoWIc5p0nAr4Bnp/nnDtbvaNyDePoSHxHxV6DvEh95xwDfTNNXAa+WpBGMcagGzSkiVkXE7cBTjQiwDmVyuikinkizi8h+S9PMyuT0aG52LNDsZ4mU+f8EcC7ZtdT+PJLB1alsTq2kTE7vAi6JiIcBImLtYJ2OxgJRdImPCf21iYiNQN8lPppVmZxazVBzOgm4vtKINl+pnCS9V9LvgM8C7x+h2Oo1aE6SXgLsERGDX8SsOZT923tTGt68StIeBcubSZmcJgOTJd0saZGkaYN1OhoLhI0ykt4GdADnNzqW4RARl0TE3sAZwMcbHc/mkLQV8Hngw42OZZj9NzAxIg4AfswzIw6tbAzZMFMncDxwqaSdB1phNBaI0XiJjzI5tZpSOUk6HPgYcHRE/GWEYqvXUN+nbuANVQY0DAbLaUdgf6BH0irgUGB+kx+oHvR9ioiHcn9vXwOmjlBs9Srzt7camB8RT0bEPcBvyApG/xp9cKWCgzVjgJXAXjxzsGa/mjbvZdOD1Fc2Ou7NzSnXdi6tcZC6zPv0YrIDb5MaHe8w5jQpN/16sqsKNDz2zcmppn0PzX+Qusz7tGtu+o3AokbHPQw5TQO+mabHkw1J7TJgv41OrKIX68
hUHX8HfCw9dw7Zt1CAZwHfA1YAvwSe3+iYhyGnl5J9Q3icbG9oeaNjHoac/gdYA9yaHvMbHfMw5PRFYHnK56aBPmyb5TFYTjVtm75AlHyf/jO9T7el92mfRsc8DDmJbDjwTmAZMH2wPn2pDTMzKzQaj0GYmdkwcIEwM7NCLhBmZlbIBcLMzAq5QJiZWSEXCGsJkt4v6S5J8yQdPchVRWdKurifZdcN9uvRKqSruL49Tc+UtFuJdXqKfnCWz1/S2ZJOT9PnpB8WIukDknYY3ixsS1PZLUfNhtkpwOERsTrN197fvJSIOHL4QhrSdmfnZmcCdwD319nXfAryj4izcrMfAL4DPFHbzqws70FY05M0m+wyxtdL+mB+D0HSmyXdIek2SQtzq+0m6QZJv5X02VxfqySNlzQx7ZFcmq6N/yNJ26c2L00XabtV0vmS7iiIaVdJC1ObOyS9Mj3fm2tzrKS5afpsSaene3V0APPSuttLOkvS4tTPnJorC5+Q28bBqa/CPSRJc9M23w/sBtwk6SZJ75T0hVy7d0m6cKjvg215XCCs6UXEu8m+bXdFRO0H21nAayPiQLL7K/Q5CDgOmAIc18/VOCeRXf54P+AR4E3p+W8A/xoRBwF/6yestwI3pjYHkv0yukwuVwFLgBkRcVBEbCC778BLI2J/YHvgqNwqO6RtnAJcVnIbX+KZ16sLuBJ4vaRtUpMTy/ZlWzYXCGt1NwNzJb2L7KYpfRZExPqI+DPZpQWK7kZ3T0TcmqaXAhPT8YkdI+IX6fn+7rq1GDhR0tnAlIh4bDNy6FJ2Z8NlwD8B++WWXQEQEQuBneo5fhIRvcBPgKMk7QNsExHLNiNe20K4QFhLS3sXHye7kuVSSX339chf+fVvFB9vK9Omv+0uBP6R7IqZc/sOQLPpDYCeNVg/kp4FfJnsAotTgEtr1qu9Fk6918b5GtmxjxPJ9pDMBuUCYS1N0t4RcUs6QPsnNr3k8ZBFxCPAY5IOSU9N72e7ewJrIuJSsg/fl6RFayS9KN0n4Y39bOYxsstkwzPF4EFJbUDt/cSPS9s7DFgfEetLppLfBhFxC9lr81bSXonZYHwWk7W68yVNIrtS5QKyq28etJl9nkR2M5WngJ+S3XGwVifwEUlPAr1A3x7EmcC1ZMVqCdBWsO5cYLakDcDLyPYa7gAeIBu6yvuzpF8B2wDvHEIOc4AbJN2fjkNAdizioEi3nDQbjK/malZDUlsatyf93mDXiDitwWFtNknXAhdGxIJGx2KtwUNMZn/vdX2nlgKvBD7d6IA2h6SdJf0G2ODiYEPhPQgzMyvkPQgzMyvkAmFmZoVcIMzMrJALhJmZFXKBMDOzQv8fA8JUwxkG1tYAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# plot histogram of predictions\n",
"xgtest = xgb.DMatrix(test_data, test_labels)\n",
"xgtest_labels = bst.predict(xgtest)\n",
"\n",
"n, bins, patches = plt.hist(xgtest_labels, 100)\n",
"plt.xlabel('fishing suitability')\n",
"plt.ylabel('frequency')\n",
"plt.title('Histogram of predicted labels')\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Log Loss / Binary Cross Entropy = 0.0140\n",
"Test Log Loss / Binary Cross Entropy = 1.1559\n",
"Confusion matrix : \n",
" [[ 1407 129657]\n",
" [ 4407 1503837]]\n",
"Classification report : \n",
" precision recall f1-score support\n",
"\n",
" 1 0.24 0.01 0.02 131064\n",
" 0 0.92 1.00 0.96 1508244\n",
"\n",
" accuracy 0.92 1639308\n",
" macro avg 0.58 0.50 0.49 1639308\n",
"weighted avg 0.87 0.92 0.88 1639308\n",
"\n"
]
}
],
"source": [
"# accuracy assessment\n",
"# log loss (binary cross entropy) on the raw predicted scores; confusion matrix and precision/recall/f1 after thresholding\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import log_loss, confusion_matrix, classification_report\n",
"\n",
"# convert the pandas label objects to numpy arrays\n",
"train_labels_np = train_labels.to_numpy()\n",
"test_labels_np = test_labels.to_numpy()\n",
"\n",
"# calculate train loss\n",
"train_loss = log_loss(train_labels_np, bst.predict(xgtrain)) # actual, predicted\n",
"print('Train Log Loss / Binary Cross Entropy = {:.4f}'.format(train_loss))\n",
"\n",
"# calculate test loss\n",
"# xgtest_labels = bst.predict(xgtest)\n",
"test_loss = log_loss(test_labels_np, xgtest_labels) # actual, predicted\n",
"print('Test Log Loss / Binary Cross Entropy = {:.4f}'.format(test_loss))\n",
"\n",
"# regression to classification\n",
"# assign 1 if score is at least pos_thres (0.05), 0 otherwise\n",
"pos_thres = 0.05\n",
"xgtest_labels_thres = np.array([1 if x >= pos_thres else 0 for x in xgtest_labels])\n",
"\n",
"# confusion matrix\n",
"matrix = confusion_matrix(test_labels_np, xgtest_labels_thres, labels=[1, 0])\n",
"print('Confusion matrix : \\n',matrix)\n",
"\n",
"# precision/recall/f1\n",
"report = classification_report(test_labels_np, xgtest_labels_thres, labels=[1, 0]) # actual then predicted\n",
"print('Classification report : \\n', report)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# histogram of true positives\n",
"# xgtest_labels_thres_prod = np.multiply(xgtest_labels_thres, test_labels_np)\n",
"temp = np.zeros(np.size(xgtest_labels))\n",
"temp[:] = np.nan\n",
"\n",
"for i in range(np.size(xgtest_labels)):\n",
" prod = test_labels_np[i] * xgtest_labels[i]\n",
" if prod > 0.05:\n",
" temp[i] = prod\n",
"\n",
"n, bins, patches = plt.hist(temp, 100)\n",
"plt.xlabel('fishing suitability')\n",
"plt.ylabel('frequency')\n",
"plt.title('Histogram of positive samples')\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# save to csv file: test data + predictions\n",
"prod_data = pd.read_csv('/tmp/testing-2021.csv')\n",
"prod_labels = pd.DataFrame({'fishing_suitability': xgtest_labels})\n",
"\n",
"pd.concat([prod_data, prod_labels], axis=1).to_csv('/tmp/testing-2021-raw.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment