Eurofab model training

import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from core.utils import used_keys

from palettable.colorbrewer.qualitative import Set3_12
from sklearn.metrics import davies_bouldin_score

from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn import model_selection
from sklearn.metrics import accuracy_score, balanced_accuracy_score, make_scorer

Read the training data, groups and labels

%%time
# Load the training features, the labels and the hexagon group assignments
# (in that order) from the processed train/test directory.
X_train, y, groups = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '/data/uscuni-eurofab/processed_data/train_test_data/training_data4.pq',
        '/data/uscuni-eurofab/processed_data/train_test_data/training_labels4.pq',
        '/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons4.pq',
    )
)
CPU times: user 56.2 s, sys: 37.9 s, total: 1min 34s
Wall time: 25.7 s
# limit to prague only: keep the rows whose index label starts with '65806'
X_train, y, groups = (
    frame[frame.index.str.startswith('65806')]
    for frame in (X_train, y, groups)
)

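Only the building classification is predicted, so drop the empty ETCs (rows whose last `_`-separated index token starts with `-`) and fill the remaining missing values with 0.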

# Drop every row whose index label ends in a '_'-separated token that starts
# with '-', then replace the remaining missing values with 0.
# Each frame is filtered by its own index, independently of the other two.
X_train = X_train.loc[~X_train.index.str.split('_').str[-1].str.startswith('-')]
X_train = X_train.fillna(0)

# NOTE(review): fillna(0) is also applied to the labels and to the groups;
# confirm that 0 is an acceptable stand-in for a missing label / hexagon.
y = y.loc[~y.index.str.split('_').str[-1].str.startswith('-')]
y = y.fillna(0)

groups = groups.loc[~groups.index.str.split('_').str[-1].str.startswith('-')]
groups = groups.fillna(0)

Set up a k-fold splitter that is stratified by label and grouped by hexagon, so that the folds respect spatial contiguity

# 5-fold splitter that stratifies on the `final_without_noise` label while
# keeping all samples that share a hexagon id inside the same fold.
gkf = model_selection.StratifiedGroupKFold(n_splits=5)

# `split` returns a one-shot generator. Materialise it so that `splits` can be
# handed to GridSearchCV and the fit cell can be re-run without the folds
# having been silently exhausted by a previous run.
splits = list(
    gkf.split(
        X_train.values,
        y.final_without_noise.values,
        groups=groups.hexagons.values,
    )
)

Set up the grid search and evaluation pipeline

# Random forest wrapped in a probability-calibration meta-estimator; the
# forest's own hyper-parameters are therefore addressed as `estimator__<name>`.
calibrated_forest = CalibratedClassifierCV(
    estimator=RandomForestClassifier(n_jobs=-1, random_state=123),
)

# Active search grid. Wider, currently disabled alternatives:
#   'estimator__max_depth': [2, 4, 6, 8, 20]
#   'estimator__min_samples_split': [2, 50, 100, 200, 400]
param_grid = {
    'estimator__max_depth': [2, 4, 6],
    'estimator__min_samples_split': [200, 400],
}

# Both metrics are recorded for every candidate.
scoring = {
    "Balanced Accuracy": make_scorer(balanced_accuracy_score),
    "Accuracy": make_scorer(accuracy_score),
}

# With several scorers, `refit` names the one used to pick and refit the
# final model; the folds come from the precomputed `splits`.
search = GridSearchCV(
    estimator=calibrated_forest,
    param_grid=param_grid,
    scoring=scoring,
    refit="Accuracy",
    cv=splits,
    return_train_score=True,
)

Train and tune the model

%%time
# Run the cross-validated grid search on the plain feature array and the
# `final_without_noise` label column.
search.fit(X_train.values, y["final_without_noise"].values)

Select the best model and predict on the test data

# Untuned baseline kept for reference (disabled):
# best_model=RandomForestClassifier(random_state=123, n_jobs=-1)
# best_model.fit(X_train, y)

# GridSearchCV exposes the refitted winning estimator as `best_estimator_`
# (with a trailing underscore); `best_estimator` raises AttributeError.
best_model = search.best_estimator_

KeyboardInterrupt
# Load the held-out features and labels.
X_test = pd.read_parquet('/data/uscuni-eurofab/processed_data/train_test_data/testing_data4.pq')
y_test = pd.read_parquet('/data/uscuni-eurofab/processed_data/train_test_data/testing_labels4.pq')

# limit to vienna only
X_test = X_test[X_test.index.str.startswith('84986')]
y_test = y_test[y_test.index.str.startswith('84986')]

# Same cleaning as for the training data: drop the rows whose last
# '_'-separated index token starts with '-', then fill missing values with 0.
X_test = X_test[~X_test.index.str.split('_').str[-1].str.startswith('-')].fillna(0)
y_test = y_test[~y_test.index.str.split('_').str[-1].str.startswith('-')].fillna(0)

# The model was fitted on a plain array (`X_train.values`), so predict on a
# plain array as well instead of a DataFrame with column names.
# NOTE(review): assumes the test columns are in the same order as the training
# columns, and that X_test and y_test rows are aligned - confirm upstream.
predictions = best_model.predict(X_test.values)

# Score against the same label column that was used for training, using the
# (y_true, y_pred) argument order that scikit-learn metrics expect.
accuracy_score(y_test.final_without_noise, predictions)
0.29364393026817087

Plot predictions

from lonboard import SolidPolygonLayer, Map
from lonboard.basemap import CartoBasemap
from lonboard.colormap import apply_categorical_cmap
from palettable.colorbrewer.qualitative import Set3_12
from core.cluster_validation import get_color
region_id = 84986
buildings = gpd.read_parquet(f'/data/uscuni-eurofab/processed_data/buildings/buildings_{region_id}.parquet')

# True labels of this region, re-keyed by the integer that forms the last
# '_'-separated token of the sample index so they can be aligned with
# `buildings`.
# NOTE(review): assumes that integer matches the `buildings` index - confirm.
labels = y_test[y_test.index.str.startswith(str(region_id))].final_without_noise
labels.index = labels.index.str.split('_').str[-1].astype(int)

# `predictions` is positional; it is assumed to be in the same row order as
# `labels` (both come from the region-filtered test set).
buildings["predicted_label"] = pd.Series(predictions, labels.index)
buildings["true_label"] = labels

# Buildings with no test label get NaN after the alignment above, and casting
# NaN to int yields meaningless values ("invalid value encountered in cast").
# Plot only the buildings that actually carry a label.
labelled_buildings = buildings.dropna(subset=["true_label"])

layer = SolidPolygonLayer.from_geopandas(
    gdf=labelled_buildings[["geometry", "predicted_label", 'true_label']],
    get_fill_color=get_color(labelled_buildings['true_label'].values.astype(int)),
    opacity=0.15,
)
/tmp/ipykernel_871757/1019886908.py:2: RuntimeWarning: invalid value encountered in cast
  gdf=buildings[["geometry", "predicted_label", 'true_label']], get_fill_color=get_color(buildings['true_label'].values.astype(int)), opacity=0.15
/home/krasen/eurofab_morphometrics/.pixi/envs/default/lib/python3.12/site-packages/lonboard/_geoarrow/ops/reproject.py:97: UserWarning: Input being reprojected to EPSG:4326 CRS
  warnings.warn("Input being reprojected to EPSG:4326 CRS")
# Put the polygon layer on a Positron basemap; the bare `m` on the last line
# makes the notebook display the map as the cell output.
m = Map(layer, basemap_style=CartoBasemap.Positron)
m