import pandas as pd
import json
import warnings
warnings.filterwarnings('ignore')
import h2o
# Start, or attach to, a local H2O cluster; min_mem_size requests a 25G minimum
# for the cluster's memory.
h2o.init(min_mem_size='25G')
# Root directories, relative to this notebook. Both must end with "/" because
# the rest of the notebook builds paths by plain string concatenation
# (e.g. MODELS_LOCATION + "nb_grid"); without the separator the grids would be
# written to ".../FINALnb_grid" instead of ".../FINAL/nb_grid".
DATA_LOCATION = "../../data/"
MODELS_LOCATION = "../../models/ALL_FEATURES/FINAL/"
Checking whether there is an H2O instance running at http://localhost:54321 . connected. Warning: Your H2O cluster version is too old (5 months and 12 days)! Please download and install the latest version from http://h2o.ai/download/
H2O_cluster_uptime: | 2 mins 59 secs |
H2O_cluster_timezone: | Etc/UTC |
H2O_data_parsing_timezone: | UTC |
H2O_cluster_version: | 3.30.0.4 |
H2O_cluster_version_age: | 5 months and 12 days !!! |
H2O_cluster_name: | H2O_from_python_azureuser_hzoube |
H2O_cluster_total_nodes: | 1 |
H2O_cluster_free_memory: | 23.22 Gb |
H2O_cluster_total_cores: | 4 |
H2O_cluster_allowed_cores: | 4 |
H2O_cluster_status: | locked, healthy |
H2O_connection_url: | http://localhost:54321 |
H2O_connection_proxy: | {"http": null, "https": null} |
H2O_internal_security: | False |
H2O_API_Extensions: | Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 |
Python_version: | 3.6.9 final |
# Load the processed training split into the H2O cluster, then preview the
# first rows.
train = h2o.import_file(f"{DATA_LOCATION}processed/final.train.tsv")
train.head()
Parse progress: |█████████████████████████████████████████████████████████| 100%
SampleID | NC000962_3.22 | NC000962_3.434 | NC000962_3.524 | NC000962_3.645 | NC000962_3.648 | NC000962_3.654 | NC000962_3.666 | NC000962_3.675 | NC000962_3.678 | NC000962_3.693 | NC000962_3.698 | NC000962_3.699 | NC000962_3.702 | NC000962_3.705 | NC000962_3.708 | NC000962_3.717 | NC000962_3.729 | NC000962_3.741 | NC000962_3.744 | NC000962_3.747 | NC000962_3.750 | NC000962_3.756 | NC000962_3.770 | NC000962_3.777 | NC000962_3.780 | NC000962_3.783 | NC000962_3.793 | NC000962_3.795 | NC000962_3.799 | NC000962_3.801 | NC000962_3.805 | NC000962_3.822 | NC000962_3.840 | NC000962_3.846 | NC000962_3.849 | NC000962_3.852 | NC000962_3.1045 | NC000962_3.1049 | NC000962_3.1089 | NC000962_3.1123 | NC000962_3.1152 | NC000962_3.1153 | NC000962_3.1155 | NC000962_3.1161 | NC000962_3.1164 | NC000962_3.1166 | NC000962_3.1167 | NC000962_3.1176 | NC000962_3.1206 | NC000962_3.1212 | NC000962_3.1255 | NC000962_3.1278 | NC000962_3.1291 | NC000962_3.1302 | NC000962_3.1326 | NC000962_3.1389 | NC000962_3.1399 | NC000962_3.1413 | NC000962_3.1416 | NC000962_3.1422 | NC000962_3.1429 | NC000962_3.1431 | NC000962_3.1432 | NC000962_3.1452 | NC000962_3.1458 | NC000962_3.1461 | NC000962_3.1470 | NC000962_3.1473 | NC000962_3.1474 | NC000962_3.1653 | NC000962_3.1676 | NC000962_3.1699 | NC000962_3.1703 | NC000962_3.1708 | NC000962_3.1718 | NC000962_3.1729 | NC000962_3.1771 | NC000962_3.1827 | NC000962_3.1849 | NC000962_3.1918 | NC000962_3.1977 | NC000962_3.2532 | NC000962_3.2745 | NC000962_3.3352 | NC000962_3.3446 | NC000962_3.4013 | NC000962_3.4086 | NC000962_3.4096 | NC000962_3.4119 | NC000962_3.4938 | NC000962_3.5075 | NC000962_3.5627 | NC000962_3.5782 | NC000962_3.5790 | NC000962_3.5791 | NC000962_3.5803 | NC000962_3.5807 | NC000962_3.5824 | NC000962_3.5839 | NC000962_3.5848 | NC000962_3.5856 | NC000962_3.5858 | NC000962_3.5860 | NC000962_3.5902 | NC000962_3.6003 | NC000962_3.6013 | NC000962_3.6112 | NC000962_3.6280 | NC000962_3.6286 | NC000962_3.6292 | NC000962_3.6307 | NC000962_3.6362 | 
NC000962_3.6382 | NC000962_3.6388 | NC000962_3.6403 | NC000962_3.6430 | NC000962_3.6436 | NC000962_3.6439 | NC000962_3.6445 | NC000962_3.6452 | NC000962_3.6453 | NC000962_3.6502 | NC000962_3.6508 | NC000962_3.6511 | NC000962_3.6515 | NC000962_3.6520 | NC000962_3.6535 | NC000962_3.6547 | NC000962_3.6550 | NC000962_3.6551 | NC000962_3.6553 | NC000962_3.6571 | NC000962_3.6575 | NC000962_3.6579 | NC000962_3.6586 | NC000962_3.6620 | NC000962_3.6638 | NC000962_3.6695 | NC000962_3.6735 | NC000962_3.6738 | NC000962_3.6742 | NC000962_3.6749 | NC000962_3.6750 | NC000962_3.6807 | NC000962_3.6878 | NC000962_3.6881 | NC000962_3.7058 | NC000962_3.7088 | NC000962_3.7170 | NC000962_3.7268 | NC000962_3.7355 | NC000962_3.7362 | NC000962_3.7496 | NC000962_3.7563 | NC000962_3.7564 | NC000962_3.7566 | NC000962_3.7567 | NC000962_3.7570 | NC000962_3.7572 | NC000962_3.7581 | NC000962_3.7582 | NC000962_3.7585 | NC000962_3.7607 | NC000962_3.7631 | NC000962_3.7637 | NC000962_3.7652 | NC000962_3.7658 | NC000962_3.7664 | NC000962_3.7683 | NC000962_3.7685 | NC000962_3.7694 | NC000962_3.7710 | NC000962_3.7712 | NC000962_3.7725 | NC000962_3.7728 | NC000962_3.7730 | NC000962_3.7892 | NC000962_3.8040 | NC000962_3.8164 | NC000962_3.8201 | NC000962_3.8434 | NC000962_3.8452 | NC000962_3.8519 | NC000962_3.8619 | NC000962_3.8624 | NC000962_3.9023 | NC000962_3.9032 | NC000962_3.9034 | NC000962_3.9050 | NC000962_3.9051 | NC000962_3.9113 | NC000962_3.9119 | NC000962_3.9134 | NC000962_3.9143 | NC000962_3.9145 | NC000962_3.9147 | NC000962_3.9153 | NC000962_3.9154 | NC000962_3.9155 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SRR10525336 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SRR10380004 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SRR6807701 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SRR11033700 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SRR1163101 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SRR7592336 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SRR1163415 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SRR6458388 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SRR5153333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SRR5152963 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Load the processed held-out split into the H2O cluster, then preview the
# first rows.
test = h2o.import_file(f"{DATA_LOCATION}processed/final.test.tsv")
test.head()
Parse progress: |█████████████████████████████████████████████████████████| 100%
SampleID | NC000962_3.22 | NC000962_3.434 | NC000962_3.524 | NC000962_3.645 | NC000962_3.648 | NC000962_3.654 | NC000962_3.666 | NC000962_3.675 | NC000962_3.678 | NC000962_3.693 | NC000962_3.698 | NC000962_3.699 | NC000962_3.702 | NC000962_3.705 | NC000962_3.708 | NC000962_3.717 | NC000962_3.729 | NC000962_3.741 | NC000962_3.744 | NC000962_3.747 | NC000962_3.750 | NC000962_3.756 | NC000962_3.770 | NC000962_3.777 | NC000962_3.780 | NC000962_3.783 | NC000962_3.793 | NC000962_3.795 | NC000962_3.799 | NC000962_3.801 | NC000962_3.805 | NC000962_3.822 | NC000962_3.840 | NC000962_3.846 | NC000962_3.849 | NC000962_3.852 | NC000962_3.1045 | NC000962_3.1049 | NC000962_3.1089 | NC000962_3.1123 | NC000962_3.1152 | NC000962_3.1153 | NC000962_3.1155 | NC000962_3.1161 | NC000962_3.1164 | NC000962_3.1166 | NC000962_3.1167 | NC000962_3.1176 | NC000962_3.1206 | NC000962_3.1212 | NC000962_3.1255 | NC000962_3.1278 | NC000962_3.1291 | NC000962_3.1302 | NC000962_3.1326 | NC000962_3.1389 | NC000962_3.1399 | NC000962_3.1413 | NC000962_3.1416 | NC000962_3.1422 | NC000962_3.1429 | NC000962_3.1431 | NC000962_3.1432 | NC000962_3.1452 | NC000962_3.1458 | NC000962_3.1461 | NC000962_3.1470 | NC000962_3.1473 | NC000962_3.1474 | NC000962_3.1653 | NC000962_3.1676 | NC000962_3.1699 | NC000962_3.1703 | NC000962_3.1708 | NC000962_3.1718 | NC000962_3.1729 | NC000962_3.1771 | NC000962_3.1827 | NC000962_3.1849 | NC000962_3.1918 | NC000962_3.1977 | NC000962_3.2532 | NC000962_3.2745 | NC000962_3.3352 | NC000962_3.3446 | NC000962_3.4013 | NC000962_3.4086 | NC000962_3.4096 | NC000962_3.4119 | NC000962_3.4938 | NC000962_3.5075 | NC000962_3.5627 | NC000962_3.5782 | NC000962_3.5790 | NC000962_3.5791 | NC000962_3.5803 | NC000962_3.5807 | NC000962_3.5824 | NC000962_3.5839 | NC000962_3.5848 | NC000962_3.5856 | NC000962_3.5858 | NC000962_3.5860 | NC000962_3.5902 | NC000962_3.6003 | NC000962_3.6013 | NC000962_3.6112 | NC000962_3.6280 | NC000962_3.6286 | NC000962_3.6292 | NC000962_3.6307 | NC000962_3.6362 | 
NC000962_3.6382 | NC000962_3.6388 | NC000962_3.6403 | NC000962_3.6430 | NC000962_3.6436 | NC000962_3.6439 | NC000962_3.6445 | NC000962_3.6452 | NC000962_3.6453 | NC000962_3.6502 | NC000962_3.6508 | NC000962_3.6511 | NC000962_3.6515 | NC000962_3.6520 | NC000962_3.6535 | NC000962_3.6547 | NC000962_3.6550 | NC000962_3.6551 | NC000962_3.6553 | NC000962_3.6571 | NC000962_3.6575 | NC000962_3.6579 | NC000962_3.6586 | NC000962_3.6620 | NC000962_3.6638 | NC000962_3.6695 | NC000962_3.6735 | NC000962_3.6738 | NC000962_3.6742 | NC000962_3.6749 | NC000962_3.6750 | NC000962_3.6807 | NC000962_3.6878 | NC000962_3.6881 | NC000962_3.7058 | NC000962_3.7088 | NC000962_3.7170 | NC000962_3.7268 | NC000962_3.7355 | NC000962_3.7362 | NC000962_3.7496 | NC000962_3.7563 | NC000962_3.7564 | NC000962_3.7566 | NC000962_3.7567 | NC000962_3.7570 | NC000962_3.7572 | NC000962_3.7581 | NC000962_3.7582 | NC000962_3.7585 | NC000962_3.7607 | NC000962_3.7631 | NC000962_3.7637 | NC000962_3.7652 | NC000962_3.7658 | NC000962_3.7664 | NC000962_3.7683 | NC000962_3.7685 | NC000962_3.7694 | NC000962_3.7710 | NC000962_3.7712 | NC000962_3.7725 | NC000962_3.7728 | NC000962_3.7730 | NC000962_3.7892 | NC000962_3.8040 | NC000962_3.8164 | NC000962_3.8201 | NC000962_3.8434 | NC000962_3.8452 | NC000962_3.8519 | NC000962_3.8619 | NC000962_3.8624 | NC000962_3.9023 | NC000962_3.9032 | NC000962_3.9034 | NC000962_3.9050 | NC000962_3.9051 | NC000962_3.9113 | NC000962_3.9119 | NC000962_3.9134 | NC000962_3.9143 | NC000962_3.9145 | NC000962_3.9147 | NC000962_3.9153 | NC000962_3.9154 | NC000962_3.9155 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ERR3335735 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SRR8552929 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ERR067629 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ERR067714 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SRR5065314 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ERR067659 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ERR067590 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ERR688027 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ERR3335727 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ERR3335759 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Training frame: every column is a predictor except the sample identifier
# and the response. list.remove raises ValueError if either column is absent.
train_response_col = "Resistance_Status"
train_predictor_cols = train.columns
for _excluded in ('SampleID', train_response_col):
    train_predictor_cols.remove(_excluded)
print(
    "train frame - predictor column: ",
    train_predictor_cols[0],
    train_predictor_cols[-1],
)
print("train frame - response column: ", train_response_col)

# Test frame: same split of identifier / response / predictors.
test_response_col = "Resistance_Status"
test_predictor_cols = test.columns
for _excluded in ('SampleID', test_response_col):
    test_predictor_cols.remove(_excluded)
print(
    "test frame - predictor columns: ",
    test_predictor_cols[0],
    test_predictor_cols[-1],
)
print("test frame - response column: ", test_response_col)
train frame - predictor column: NC000962_3.22 NC000962_3.4411327 train frame - response column: Resistance_Status test frame - predictor columns: NC000962_3.22 NC000962_3.4411327 test frame - response column: Resistance_Status
# Convert the label column of both frames to a factor; for binary
# classification the response should be categorical.
train[train_response_col] = train[train_response_col].asfactor()
test[test_response_col] = test[test_response_col].asfactor()

# Settings shared by every grid search in this notebook.
x, y = train_predictor_cols, train_response_col
nfolds = 5  # number of CV folds (to generate level-one data for stacking)
MAX_GRID_MODELS = 10  # cap on models per grid, and on models kept afterwards
from h2o.estimators import H2ONaiveBayesEstimator
from h2o.grid.grid_search import H2OGridSearch

# Naive Bayes search space: all five tunables share the same candidate values.
# ("compute_metrics": [True, False] was considered but is not searched.)
hyper_params = {
    name: [0.1, 0.3, 0.6, 0.9, 1.0]
    for name in ("laplace", "min_sdev", "eps_sdev", "min_prob", "eps_prob")
}

# Randomly sample combinations, building at most MAX_GRID_MODELS models.
search_criteria = dict(strategy="RandomDiscrete", max_models=MAX_GRID_MODELS)

# Template estimator: cross-validated, keeping the per-fold predictions.
base_model = H2ONaiveBayesEstimator(
    nfolds=nfolds,
    fold_assignment="random",
    keep_cross_validation_predictions=True,
    seed=1234,
)

nb_grid = H2OGridSearch(
    model=base_model,
    hyper_params=hyper_params,
    search_criteria=search_criteria,
)
# NOTE(review): the held-out test frame is passed as validation_frame --
# confirm this is intended, since it exposes test data during model selection.
nb_grid.train(x=x, y=y, training_frame=train, validation_frame=test)
naivebayes Grid Build progress: |█████████████████████████████████████████| 100%
# Persist the Naive Bayes grid. The target directory is built by plain
# concatenation, so MODELS_LOCATION must end with a path separator.
h2o.save_grid(f"{MODELS_LOCATION}nb_grid", nb_grid.grid_id)
dict_keys(['Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_10', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_11', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_9', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_3', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_18', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_8', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_1', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_2', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_7', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_14', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_16', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_15', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_20', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_6', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_12', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_13', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_5', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_4', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_17', 'Grid_NaiveBayes_py_5_sid_8e0c_model_python_1604225741410_1205_model_19'])
# Rank the Naive Bayes grid by AUC, best first, and keep at most
# MAX_GRID_MODELS model ids.
sorted_grid = nb_grid.get_grid(sort_by='auc', decreasing=True)
top_nb_model_ids = sorted_grid.model_ids[:MAX_GRID_MODELS]

# Collect each kept model's actual parameters and AUC, keyed by model id.
# NOTE(review): model.auc() is called without arguments; presumably that is
# the training-data AUC rather than the metric the grid was ranked on -- confirm.
top_nb_model_id_params_dict = {}
for mdl_id in top_nb_model_ids:
    model = h2o.get_model(mdl_id)
    top_nb_model_id_params_dict[mdl_id] = dict(
        params=model.actual_params,
        auc=model.auc(),
    )

# Write the summary as JSON next to the data.
with open(DATA_LOCATION + 'top_nb_models.json', 'w') as json_file:
    json.dump(top_nb_model_id_params_dict, json_file)
from h2o.estimators import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch

# GLM search space.
# - lambda is not searched here (original note: "use self.lambda_").
# - "missing_values_handling" (mean_imputation / skip / plug_values) was
#   considered but is not searched.
# NOTE(review): theta and the tweedie_* powers presumably only affect
# non-binomial families, while the base model below is binomial -- confirm
# they are worth searching.
hyper_params = {
    "alpha": [0, 0.3, 0.6, 0.9, 1],
    "theta": [0, 0.3, 0.6, 0.9, 1],
    "tweedie_link_power": [0, 0.3, 0.6, 0.9, 1, 3, 6, 9],
    "tweedie_variance_power": [0, 0.3, 0.6, 0.9, 1, 3, 6, 9],
}

# Randomly sample combinations, building at most MAX_GRID_MODELS models.
search_criteria = dict(strategy="RandomDiscrete", max_models=MAX_GRID_MODELS)

# Template estimator: a cross-validated binomial GLM that keeps its per-fold
# predictions.
base_model = H2OGeneralizedLinearEstimator(
    family="binomial",
    nfolds=nfolds,
    fold_assignment="random",
    keep_cross_validation_predictions=True,
    seed=1234,
)

# Train the grid.
glm_grid = H2OGridSearch(
    model=base_model,
    hyper_params=hyper_params,
    search_criteria=search_criteria,
)
glm_grid.train(x=x, y=y, training_frame=train, validation_frame=test)

# Persist the GLM grid. The target directory is built by plain concatenation,
# so MODELS_LOCATION must end with a path separator.
h2o.save_grid(f"{MODELS_LOCATION}glm_grid", glm_grid.grid_id)
# Rank the GLM grid by AUC, best first, and keep at most MAX_GRID_MODELS
# model ids.
sorted_grid = glm_grid.get_grid(sort_by='auc', decreasing=True)
top_glm_model_ids = sorted_grid.model_ids[:MAX_GRID_MODELS]

# Collect each kept model's actual parameters and AUC, keyed by model id.
# NOTE(review): model.auc() is called without arguments; presumably that is
# the training-data AUC rather than the metric the grid was ranked on -- confirm.
top_glm_model_id_params_dict = {}
for mdl_id in top_glm_model_ids:
    model = h2o.get_model(mdl_id)
    top_glm_model_id_params_dict[mdl_id] = dict(
        params=model.actual_params,
        auc=model.auc(),
    )

# Write the summary as JSON next to the data.
with open(DATA_LOCATION + 'top_glm_models.json', 'w') as json_file:
    json.dump(top_glm_model_id_params_dict, json_file)
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2OGradientBoostingEstimator
# Candidate values sampled by the GBM random search.
# Parameters considered but left out of the search:
#   max_abs_leafnode_pred (use default value), class_sampling_factors,
#   max_after_balance_size, min_rows, nbins,
#   nbins_top_level / nbins_cats (require too much tuning), r2_stopping, seed,
#   build_tree_one_node, sample_rate_per_class ([0.1, 0.3, 0.6, 0.9]),
#   score_tree_interval, min_split_improvement.
hyper_params = dict(
    learn_rate=[0.1, 0.3, 0.6, 0.9],
    learn_rate_annealing=[0.1, 0.3, 0.6, 0.9, 1],
    distribution=['bernoulli', 'multinomial'],
    quantile_alpha=[0.1, 0.3, 0.5, 0.8, 1],
    tweedie_power=[0, 1, 1.5, 1.9],
    col_sample_rate=[0.1, 0.3, 0.7, 0.9],
    balance_classes=[True, False],
    ntrees=[10, 20, 50, 100, 150],
    max_depth=[5, 10, 15, 20],
    sample_rate=[0.1, 0.3, 0.6, 0.9],
    col_sample_rate_per_tree=[0.1, 0.3, 0.6, 0.8, 1],
    col_sample_rate_change_per_level=[0.1, 0.3, 0.6, 0.8, 1, 1.3, 1.5, 1.7, 1.9],
    histogram_type=["AUTO", "UniformAdaptive", "Random", "QuantilesGlobal", "RoundRobin"],
)

# Random discrete search, capped at MAX_GRID_MODELS models.
search_criteria = dict(strategy="RandomDiscrete", max_models=MAX_GRID_MODELS)

# Base GBM: cross-validated with randomly assigned folds; the
# cross-validation predictions are kept on the model.
base_model = H2OGradientBoostingEstimator(
    seed=1234,
    nfolds=nfolds,
    fold_assignment="random",
    keep_cross_validation_predictions=True,
)

# Build the grid one model at a time (parallelism=1) on `train`, scoring every
# model on `test` as validation frame.
gbm_grid = H2OGridSearch(
    base_model,
    hyper_params,
    search_criteria=search_criteria,
    parallelism=1,
)
gbm_grid.train(x=x, y=y, training_frame=train, validation_frame=test)
gbm Grid Build progress: |████████████████████████████████████████████████
# Export the GBM grid into a "gbm_grid" directory under MODELS_LOCATION.
# MODELS_LOCATION is not guaranteed to end with "/", so normalise it before
# appending the directory name; plain concatenation would otherwise glue the
# two path components together (e.g. ".../FINALgbm_grid").
h2o.save_grid(MODELS_LOCATION.rstrip("/") + "/gbm_grid", gbm_grid.grid_id)
'../models/ALL_FEATURES/gbm_grid/Grid_GBM_py_7_sid_9651_model_python_1604407520638_1'
# Rank the GBM grid by AUC (best first) and keep at most MAX_GRID_MODELS ids.
sorted_grid = gbm_grid.get_grid(sort_by='auc', decreasing=True)
top_gbm_model_ids = sorted_grid.model_ids[:MAX_GRID_MODELS]

# For every kept model, record the parameters it was actually built with and
# its AUC scores.  `model.auc()` without flags reports the training-set AUC
# (which is why the recorded values need not follow the ranking order), so the
# validation and cross-validation AUCs are stored next to it; the original
# 'params' and 'auc' entries are unchanged.
top_gbm_model_id_params_dict = {}
for mdl_id in top_gbm_model_ids:
    model = h2o.get_model(mdl_id)
    top_gbm_model_id_params_dict[mdl_id] = {
        'params': model.actual_params,
        'auc': model.auc(),
        'valid_auc': model.auc(valid=True),
        'xval_auc': model.auc(xval=True),
    }

# Persist the summary as JSON under DATA_LOCATION.
with open(DATA_LOCATION + 'top_gbm_models.json', 'w') as json_file:
    json.dump(top_gbm_model_id_params_dict, json_file)
{'Grid_GBM_py_7_sid_9651_model_python_1604407520638_1_model_7': {'params': {'model_id': 'Grid_GBM_py_7_sid_9651_model_python_1604407520638_1_model_7', 'training_frame': 'py_7_sid_9651', 'validation_frame': 'py_8_sid_9651', 'nfolds': 5, 'keep_cross_validation_models': True, 'keep_cross_validation_predictions': True, 'keep_cross_validation_fold_assignment': False, 'score_each_iteration': False, 'score_tree_interval': 0, 'fold_assignment': 'Random', 'fold_column': None, 'response_column': 'Resistance_Status', 'ignored_columns': ['SampleID'], 'ignore_const_cols': True, 'offset_column': None, 'weights_column': None, 'balance_classes': False, 'class_sampling_factors': None, 'max_after_balance_size': 5.0, 'max_confusion_matrix_size': 20, 'max_hit_ratio_k': 0, 'ntrees': 50, 'max_depth': 5, 'min_rows': 10.0, 'nbins': 20, 'nbins_top_level': 1024, 'nbins_cats': 1024, 'r2_stopping': 1.7976931348623157e+308, 'stopping_rounds': 0, 'stopping_metric': 'AUTO', 'stopping_tolerance': 0.001, 'max_runtime_secs': 0.0, 'seed': 1234, 'build_tree_one_node': False, 'learn_rate': 0.1, 'learn_rate_annealing': 0.9, 'distribution': 'bernoulli', 'quantile_alpha': 1.0, 'tweedie_power': 1.9, 'huber_alpha': 0.9, 'checkpoint': None, 'sample_rate': 0.9, 'sample_rate_per_class': None, 'col_sample_rate': 0.3, 'col_sample_rate_change_per_level': 0.8, 'col_sample_rate_per_tree': 0.6, 'min_split_improvement': 1e-05, 'histogram_type': 'Random', 'max_abs_leafnode_pred': 1.7976931348623157e+308, 'pred_noise_bandwidth': 0.0, 'categorical_encoding': 'AUTO', 'calibrate_model': False, 'calibration_frame': None, 'custom_metric_func': None, 'custom_distribution_func': None, 'export_checkpoints_dir': None, 'monotone_constraints': None, 'check_constant_response': True}, 'auc': 0.9466164302600473}, 'Grid_GBM_py_7_sid_9651_model_python_1604407520638_1_model_5': {'params': {'model_id': 'Grid_GBM_py_7_sid_9651_model_python_1604407520638_1_model_5', 'training_frame': 'py_7_sid_9651', 'validation_frame': 'py_8_sid_9651', 
'nfolds': 5, 'keep_cross_validation_models': True, 'keep_cross_validation_predictions': True, 'keep_cross_validation_fold_assignment': False, 'score_each_iteration': False, 'score_tree_interval': 0, 'fold_assignment': 'Random', 'fold_column': None, 'response_column': 'Resistance_Status', 'ignored_columns': ['SampleID'], 'ignore_const_cols': True, 'offset_column': None, 'weights_column': None, 'balance_classes': False, 'class_sampling_factors': None, 'max_after_balance_size': 5.0, 'max_confusion_matrix_size': 20, 'max_hit_ratio_k': 0, 'ntrees': 12, 'max_depth': 5, 'min_rows': 10.0, 'nbins': 20, 'nbins_top_level': 1024, 'nbins_cats': 1024, 'r2_stopping': 1.7976931348623157e+308, 'stopping_rounds': 0, 'stopping_metric': 'AUTO', 'stopping_tolerance': 0.001, 'max_runtime_secs': 0.0, 'seed': 1234, 'build_tree_one_node': False, 'learn_rate': 0.3, 'learn_rate_annealing': 0.3, 'distribution': 'multinomial', 'quantile_alpha': 0.8, 'tweedie_power': 1.5, 'huber_alpha': 0.9, 'checkpoint': None, 'sample_rate': 0.6, 'sample_rate_per_class': None, 'col_sample_rate': 0.7, 'col_sample_rate_change_per_level': 1.7, 'col_sample_rate_per_tree': 0.8, 'min_split_improvement': 1e-05, 'histogram_type': 'UniformAdaptive', 'max_abs_leafnode_pred': 1.7976931348623157e+308, 'pred_noise_bandwidth': 0.0, 'categorical_encoding': 'AUTO', 'calibrate_model': False, 'calibration_frame': None, 'custom_metric_func': None, 'custom_distribution_func': None, 'export_checkpoints_dir': None, 'monotone_constraints': None, 'check_constant_response': True}, 'auc': 0.9268247635933806}, 'Grid_GBM_py_7_sid_9651_model_python_1604407520638_1_model_3': {'params': {'model_id': 'Grid_GBM_py_7_sid_9651_model_python_1604407520638_1_model_3', 'training_frame': 'py_7_sid_9651', 'validation_frame': 'py_8_sid_9651', 'nfolds': 5, 'keep_cross_validation_models': True, 'keep_cross_validation_predictions': True, 'keep_cross_validation_fold_assignment': False, 'score_each_iteration': False, 'score_tree_interval': 0, 
'fold_assignment': 'Random', 'fold_column': None, 'response_column': 'Resistance_Status', 'ignored_columns': ['SampleID'], 'ignore_const_cols': True, 'offset_column': None, 'weights_column': None, 'balance_classes': False, 'class_sampling_factors': None, 'max_after_balance_size': 5.0, 'max_confusion_matrix_size': 20, 'max_hit_ratio_k': 0, 'ntrees': 7, 'max_depth': 20, 'min_rows': 10.0, 'nbins': 20, 'nbins_top_level': 1024, 'nbins_cats': 1024, 'r2_stopping': 1.7976931348623157e+308, 'stopping_rounds': 0, 'stopping_metric': 'AUTO', 'stopping_tolerance': 0.001, 'max_runtime_secs': 0.0, 'seed': 1234, 'build_tree_one_node': False, 'learn_rate': 0.3, 'learn_rate_annealing': 0.1, 'distribution': 'bernoulli', 'quantile_alpha': 0.8, 'tweedie_power': 1.9, 'huber_alpha': 0.9, 'checkpoint': None, 'sample_rate': 0.6, 'sample_rate_per_class': None, 'col_sample_rate': 0.7, 'col_sample_rate_change_per_level': 1.3, 'col_sample_rate_per_tree': 0.8, 'min_split_improvement': 1e-05, 'histogram_type': 'RoundRobin', 'max_abs_leafnode_pred': 1.7976931348623157e+308, 'pred_noise_bandwidth': 0.0, 'categorical_encoding': 'AUTO', 'calibrate_model': False, 'calibration_frame': None, 'custom_metric_func': None, 'custom_distribution_func': None, 'export_checkpoints_dir': None, 'monotone_constraints': None, 'check_constant_response': True}, 'auc': 0.9481693262411347}, 'Grid_GBM_py_7_sid_9651_model_python_1604407520638_1_model_10': {'params': {'model_id': 'Grid_GBM_py_7_sid_9651_model_python_1604407520638_1_model_10', 'training_frame': 'py_7_sid_9651', 'validation_frame': 'py_8_sid_9651', 'nfolds': 5, 'keep_cross_validation_models': True, 'keep_cross_validation_predictions': True, 'keep_cross_validation_fold_assignment': False, 'score_each_iteration': False, 'score_tree_interval': 0, 'fold_assignment': 'Random', 'fold_column': None, 'response_column': 'Resistance_Status', 'ignored_columns': ['SampleID'], 'ignore_const_cols': True, 'offset_column': None, 'weights_column': None, 'balance_classes': 
True, 'class_sampling_factors': None, 'max_after_balance_size': 5.0, 'max_confusion_matrix_size': 20, 'max_hit_ratio_k': 0, 'ntrees': 10, 'max_depth': 10, 'min_rows': 10.0, 'nbins': 20, 'nbins_top_level': 1024, 'nbins_cats': 1024, 'r2_stopping': 1.7976931348623157e+308, 'stopping_rounds': 0, 'stopping_metric': 'AUTO', 'stopping_tolerance': 0.001, 'max_runtime_secs': 0.0, 'seed': 1234, 'build_tree_one_node': False, 'learn_rate': 0.3, 'learn_rate_annealing': 0.9, 'distribution': 'multinomial', 'quantile_alpha': 0.5, 'tweedie_power': 1.9, 'huber_alpha': 0.9, 'checkpoint': None, 'sample_rate': 0.3, 'sample_rate_per_class': None, 'col_sample_rate': 0.3, 'col_sample_rate_change_per_level': 0.1, 'col_sample_rate_per_tree': 0.1, 'min_split_improvement': 1e-05, 'histogram_type': 'AUTO', 'max_abs_leafnode_pred': 1.7976931348623157e+308, 'pred_noise_bandwidth': 0.0, 'categorical_encoding': 'AUTO', 'calibrate_model': False, 'calibration_frame': None, 'custom_metric_func': None, 'custom_distribution_func': None, 'export_checkpoints_dir': None, 'monotone_constraints': None, 'check_constant_response': True}, 'auc': 0.8624969059405941}, 'Grid_GBM_py_7_sid_9651_model_python_1604407520638_1_model_1': {'params': {'model_id': 'Grid_GBM_py_7_sid_9651_model_python_1604407520638_1_model_1', 'training_frame': 'py_7_sid_9651', 'validation_frame': 'py_8_sid_9651', 'nfolds': 5, 'keep_cross_validation_models': True, 'keep_cross_validation_predictions': True, 'keep_cross_validation_fold_assignment': False, 'score_each_iteration': False, 'score_tree_interval': 0, 'fold_assignment': 'Random', 'fold_column': None, 'response_column': 'Resistance_Status', 'ignored_columns': ['SampleID'], 'ignore_const_cols': True, 'offset_column': None, 'weights_column': None, 'balance_classes': True, 'class_sampling_factors': None, 'max_after_balance_size': 5.0, 'max_confusion_matrix_size': 20, 'max_hit_ratio_k': 0, 'ntrees': 10, 'max_depth': 10, 'min_rows': 10.0, 'nbins': 20, 'nbins_top_level': 1024, 
'nbins_cats': 1024, 'r2_stopping': 1.7976931348623157e+308, 'stopping_rounds': 0, 'stopping_metric': 'AUTO', 'stopping_tolerance': 0.001, 'max_runtime_secs': 0.0, 'seed': 1234, 'build_tree_one_node': False, 'learn_rate': 0.9, 'learn_rate_annealing': 0.9, 'distribution': 'bernoulli', 'quantile_alpha': 0.3, 'tweedie_power': 1.9, 'huber_alpha': 0.9, 'checkpoint': None, 'sample_rate': 0.1, 'sample_rate_per_class': None, 'col_sample_rate': 0.3, 'col_sample_rate_change_per_level': 1.7, 'col_sample_rate_per_tree': 1.0, 'min_split_improvement': 1e-05, 'histogram_type': 'RoundRobin', 'max_abs_leafnode_pred': 1.7976931348623157e+308, 'pred_noise_bandwidth': 0.0, 'categorical_encoding': 'AUTO', 'calibrate_model': False, 'calibration_frame': None, 'custom_metric_func': None, 'custom_distribution_func': None, 'export_checkpoints_dir': None, 'monotone_constraints': None, 'check_constant_response': True}, 'auc': 0.8909367264851485}, 'Grid_GBM_py_7_sid_9651_model_python_1604407520638_1_model_9': {'params': {'model_id': 'Grid_GBM_py_7_sid_9651_model_python_1604407520638_1_model_9', 'training_frame': 'py_7_sid_9651', 'validation_frame': 'py_8_sid_9651', 'nfolds': 5, 'keep_cross_validation_models': True, 'keep_cross_validation_predictions': True, 'keep_cross_validation_fold_assignment': False, 'score_each_iteration': False, 'score_tree_interval': 0, 'fold_assignment': 'Random', 'fold_column': None, 'response_column': 'Resistance_Status', 'ignored_columns': ['SampleID'], 'ignore_const_cols': True, 'offset_column': None, 'weights_column': None, 'balance_classes': True, 'class_sampling_factors': None, 'max_after_balance_size': 5.0, 'max_confusion_matrix_size': 20, 'max_hit_ratio_k': 0, 'ntrees': 7, 'max_depth': 15, 'min_rows': 10.0, 'nbins': 20, 'nbins_top_level': 1024, 'nbins_cats': 1024, 'r2_stopping': 1.7976931348623157e+308, 'stopping_rounds': 0, 'stopping_metric': 'AUTO', 'stopping_tolerance': 0.001, 'max_runtime_secs': 0.0, 'seed': 1234, 'build_tree_one_node': False, 'learn_rate': 
0.1, 'learn_rate_annealing': 0.1, 'distribution': 'multinomial', 'quantile_alpha': 0.1, 'tweedie_power': 1.9, 'huber_alpha': 0.9, 'checkpoint': None, 'sample_rate': 0.1, 'sample_rate_per_class': None, 'col_sample_rate': 0.3, 'col_sample_rate_change_per_level': 1.3, 'col_sample_rate_per_tree': 0.6, 'min_split_improvement': 1e-05, 'histogram_type': 'UniformAdaptive', 'max_abs_leafnode_pred': 1.7976931348623157e+308, 'pred_noise_bandwidth': 0.0, 'categorical_encoding': 'AUTO', 'calibrate_model': False, 'calibration_frame': None, 'custom_metric_func': None, 'custom_distribution_func': None, 'export_checkpoints_dir': None, 'monotone_constraints': None, 'check_constant_response': True}, 'auc': 0.8110806002475247}}
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2ORandomForestEstimator
# Candidate values sampled by the random-forest (DRF) random search.
# Parameters considered but left out of the search:
#   score_tree_interval, min_split_improvement, class_sampling_factors,
#   max_after_balance_size, min_rows, nbins,
#   nbins_top_level / nbins_cats (require too much tuning), r2_stopping, seed,
#   build_tree_one_node, sample_rate_per_class ([0.1, 0.9]).
hyper_params = dict(
    mtries=[-1, 30, 60, 90, 150, 200, 350, 500],
    balance_classes=[True, False],
    ntrees=[10, 20, 50, 100, 150],
    max_depth=[5, 10, 15, 20],
    sample_rate=[0.1, 0.3, 0.6, 0.9],
    col_sample_rate_per_tree=[0.1, 0.3, 0.6, 0.8, 1],
    col_sample_rate_change_per_level=[0.1, 0.3, 0.6, 0.8, 1, 1.3, 1.5, 1.7, 1.9],
    histogram_type=["AUTO", "UniformAdaptive", "Random", "QuantilesGlobal", "RoundRobin"],
)

# Random discrete search, capped at MAX_GRID_MODELS models.
search_criteria = dict(strategy="RandomDiscrete", max_models=MAX_GRID_MODELS)

# Base random forest: cross-validated with randomly assigned folds; the
# cross-validation predictions are kept on the model.
base_model = H2ORandomForestEstimator(
    seed=1234,
    nfolds=nfolds,
    fold_assignment="random",
    keep_cross_validation_predictions=True,
)

# Build the grid one model at a time (parallelism=1) on `train`, scoring every
# model on `test` as validation frame.
rf_grid = H2OGridSearch(
    base_model,
    hyper_params,
    search_criteria=search_criteria,
    parallelism=1,
)
rf_grid.train(x=x, y=y, training_frame=train, validation_frame=test)
drf Grid Build progress: |████████████████████████████████████████████████
# NOTE(review): a previous run of this cell failed inside h2o.save_grid with
# "Argument `grid_id` should be a string, got NoneType" - rf_grid.grid_id was
# still None, presumably because the grid build had not run to completion.
# Fail early with an actionable message instead of an opaque type-check error.
if rf_grid.grid_id is None:
    raise RuntimeError(
        "rf_grid has no grid_id yet; run rf_grid.train(...) to completion "
        "before calling h2o.save_grid."
    )
# Export the DRF grid into a "drf_grid" directory under MODELS_LOCATION.
# MODELS_LOCATION is not guaranteed to end with "/", so normalise it before
# appending the directory name; plain concatenation would otherwise glue the
# two path components together (e.g. ".../FINALdrf_grid").
h2o.save_grid(MODELS_LOCATION.rstrip("/") + "/drf_grid", rf_grid.grid_id)
--------------------------------------------------------------------------- H2OTypeError Traceback (most recent call last) <ipython-input-18-5e4389eaf4cf> in <module> ----> 1 h2o.save_grid(MODELS_LOCATION + "drf_grid", rf_grid.grid_id) /anaconda/envs/azureml_py36/lib/python3.6/site-packages/h2o/h2o.py in save_grid(grid_directory, grid_id) 543 """ 544 assert_is_type(grid_directory, str) --> 545 assert_is_type(grid_id, str) 546 api("POST /3/Grid.bin/" + grid_id + "/export", {"grid_directory": grid_directory}) 547 return grid_directory + "/" + grid_id /anaconda/envs/azureml_py36/lib/python3.6/site-packages/h2o/utils/typechecks.py in assert_is_type(var, *types, **kwargs) 455 vtn = _get_type_name(type(var)) 456 raise H2OTypeError(var_name=vname, var_value=var, var_type_name=vtn, exp_type_name=etn, message=message, --> 457 skip_frames=skip_frames) 458 459 H2OTypeError: Argument `grid_id` should be a string, got NoneType None
# Rank the DRF grid by AUC (best first) and keep at most MAX_GRID_MODELS ids.
sorted_grid = rf_grid.get_grid(sort_by='auc', decreasing=True)
top_rf_model_ids = sorted_grid.model_ids[:MAX_GRID_MODELS]

# For every kept model, record the parameters it was actually built with and
# its AUC scores.  `model.auc()` without flags reports the training-set AUC,
# so the validation and cross-validation AUCs are stored next to it; the
# original 'params' and 'auc' entries are unchanged.
top_rf_model_id_params_dict = {}
for mdl_id in top_rf_model_ids:
    model = h2o.get_model(mdl_id)
    top_rf_model_id_params_dict[mdl_id] = {
        'params': model.actual_params,
        'auc': model.auc(),
        'valid_auc': model.auc(valid=True),
        'xval_auc': model.auc(xval=True),
    }

# Persist the summary as JSON under DATA_LOCATION.
with open(DATA_LOCATION + 'top_rf_models.json', 'w') as json_file:
    json.dump(top_rf_model_id_params_dict, json_file)
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2ODeepLearningEstimator
# Candidate values sampled by the deep-learning random search.
# Parameters considered but left out of the search:
#   adaptive_rate, average_activation, balance_classes, categorical_encoding,
#   classification_stop, class_sampling_factors, col_major,
#   elastic_averaging_moving_rate, elastic_averaging_regularization,
#   elastic_averaging, epochs, epsilon, fast_mode, force_load_balance,
#   initial_biases, initial_weights, initial_weight_distribution,
#   initial_weight_scale, l1, l2, max_after_balance_size,
#   max_categorical_features, max_w2, missing_values_handling, momentum_ramp,
#   momentum_stable, momentum_start, nesterov_accelerated_gradient,
#   overwrite_with_best_model, quantile_alpha, quiet_mode, rate_annealing,
#   rate_decay, rate, regression_stop, replicate_training_data, reproducible,
#   score_duty_cycle, score_interval, score_training_samples,
#   score_validation_samples, score_validation_sampling, seed,
#   shuffle_training_data, single_node_mode, sparse, sparsity_beta,
#   target_ratio_comm_to_comp, train_samples_per_iteration, tweedie_power,
#   use_all_factor_levels, variable_importances.
# NOTE(review): hidden_dropout_ratios mixes scalars with per-layer lists and
# lists [0.5, 0.5] twice - confirm every entry is valid for each sampled
# `hidden` / `activation` combination.
hyper_params = dict(
    activation=['Rectifier', 'RectifierWithDropout'],
    distribution=['bernoulli', 'multinomial'],
    hidden_dropout_ratios=[0, 0.1, 0.2, [0.5, 0.5], [0.5, 0.5]],
    hidden=[[10, 10, 10], [50], [500, 500], [500, 500, 500]],
    input_dropout_ratio=[0, 0.10, 0.15, 0.20],
    rho=[0.95, 0.90],
    standardize=[True, False],
    loss=['Absolute', 'Quadratic', 'Huber', 'CrossEntropy'],
)

# Random discrete search, capped at MAX_GRID_MODELS models.
search_criteria = dict(strategy="RandomDiscrete", max_models=MAX_GRID_MODELS)

# Base deep-learning model: cross-validated with randomly assigned folds; the
# cross-validation predictions are kept on the model.
base_model = H2ODeepLearningEstimator(
    seed=1234,
    nfolds=nfolds,
    fold_assignment="random",
    keep_cross_validation_predictions=True,
)

# Build the grid on `train`, scoring every model on `test` as validation frame.
dl_grid = H2OGridSearch(
    base_model,
    hyper_params,
    search_criteria=search_criteria,
)
dl_grid.train(x=x, y=y, training_frame=train, validation_frame=test)
# Export the deep-learning grid into a "dl_grid" directory under MODELS_LOCATION.
# MODELS_LOCATION is not guaranteed to end with "/", so normalise it before
# appending the directory name; plain concatenation would otherwise glue the
# two path components together (e.g. ".../FINALdl_grid").
h2o.save_grid(MODELS_LOCATION.rstrip("/") + "/dl_grid", dl_grid.grid_id)
# Rank the deep-learning grid by AUC (best first) and keep at most
# MAX_GRID_MODELS model ids.
sorted_grid = dl_grid.get_grid(sort_by='auc', decreasing=True)
top_dl_model_ids = sorted_grid.model_ids[:MAX_GRID_MODELS]

# For every kept model, record the parameters it was actually built with and
# its AUC scores.  `model.auc()` without flags reports the training-set AUC,
# so the validation and cross-validation AUCs are stored next to it; the
# original 'params' and 'auc' entries are unchanged.
top_dl_model_id_params_dict = {}
for mdl_id in top_dl_model_ids:
    model = h2o.get_model(mdl_id)
    top_dl_model_id_params_dict[mdl_id] = {
        'params': model.actual_params,
        'auc': model.auc(),
        'valid_auc': model.auc(valid=True),
        'xval_auc': model.auc(xval=True),
    }

# Persist the summary as JSON under DATA_LOCATION.
with open(DATA_LOCATION + 'top_dl_models.json', 'w') as json_file:
    json.dump(top_dl_model_id_params_dict, json_file)
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2OXGBoostEstimator
# Candidate values sampled by the XGBoost random search.
# Parameters considered but left out of the search:
#   max_abs_leafnode_pred, min_split_improvement, max_bins, max_delta_step,
#   max_leaves, min_rows, one_drop, rate_drop, reg_alpha, reg_lambda,
#   skip_drop, num_leaves.
hyper_params = dict(
    distribution=['bernoulli', 'multinomial'],
    categorical_encoding=['auto', 'binary', 'label_encoder'],
    ntrees=[10, 50, 70, 100],
    booster=['gbtree', 'gblinear', 'dart'],
    col_sample_rate=[0.1, 0.3, 0.6, 0.8, 1],
    colsample_bylevel=[0.1, 0.3, 0.6, 0.8, 1],
    colsample_bytree=[0.1, 0.3, 0.6, 0.8, 1],
    learn_rate=[0.1, 0.3, 0.6, 0.8, 1],
    grow_policy=['depthwise', 'lossguide'],
    max_depth=[0, 3, 6],
    normalize_type=['tree', 'forest'],
    sample_type=['uniform', 'weighted'],
    sample_rate=[0.1, 0.3, 0.6, 0.8, 1],
    tree_method=['auto', 'exact', 'approx', 'hist'],
    tweedie_power=[1.2, 1.5, 1.8],
)

# Random discrete search, capped at MAX_GRID_MODELS models.
search_criteria = dict(strategy="RandomDiscrete", max_models=MAX_GRID_MODELS)

# Base XGBoost model: cross-validated with randomly assigned folds; the
# cross-validation predictions are kept on the model.
base_model = H2OXGBoostEstimator(
    seed=1234,
    nfolds=nfolds,
    fold_assignment="random",
    keep_cross_validation_predictions=True,
)

# Build the grid on `train`, scoring every model on `test` as validation frame.
xgb_grid = H2OGridSearch(
    base_model,
    hyper_params,
    search_criteria=search_criteria,
)
xgb_grid.train(x=x, y=y, training_frame=train, validation_frame=test)
xgboost Grid Build progress: |████████████████████████████████████████████| 100%
# Export the XGBoost grid into an "xgb_grid" directory under MODELS_LOCATION.
# MODELS_LOCATION is not guaranteed to end with "/", so normalise it before
# appending the directory name; plain concatenation would otherwise glue the
# two path components together (e.g. ".../FINALxgb_grid").
h2o.save_grid(MODELS_LOCATION.rstrip("/") + "/xgb_grid", xgb_grid.grid_id)
# Rank the XGBoost grid by AUC (best first) and keep at most MAX_GRID_MODELS ids.
sorted_grid = xgb_grid.get_grid(sort_by='auc', decreasing=True)
top_xgb_model_ids = sorted_grid.model_ids[:MAX_GRID_MODELS]

# For every kept model, record the parameters it was actually built with and
# its AUC scores.  `model.auc()` without flags reports the training-set AUC,
# so the validation and cross-validation AUCs are stored next to it; the
# original 'params' and 'auc' entries are unchanged.
top_xgb_model_id_params_dict = {}
for mdl_id in top_xgb_model_ids:
    model = h2o.get_model(mdl_id)
    top_xgb_model_id_params_dict[mdl_id] = {
        'params': model.actual_params,
        'auc': model.auc(),
        'valid_auc': model.auc(valid=True),
        'xval_auc': model.auc(xval=True),
    }

# Persist the summary as JSON under DATA_LOCATION.
with open(DATA_LOCATION + 'top_xgb_models.json', 'w') as json_file:
    json.dump(top_xgb_model_id_params_dict, json_file)