import glob
import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from core.utils import used_keys
from palettable.colorbrewer.qualitative import Set3_12
from sklearn.metrics import davies_bouldin_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn import model_selection
from sklearn.metrics import accuracy_score, balanced_accuracy_score, make_scorer
Eurofab model training
Read the training data, groups and labels
%%time
# Load the training inputs: the feature table, the label table and the
# hexagon table (its `hexagons` column is used as the cross-validation group).
X_train = pd.read_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_data4.pq')
y = pd.read_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_labels4.pq')
groups = pd.read_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons4.pq')
CPU times: user 56.2 s, sys: 37.9 s, total: 1min 34s
Wall time: 25.7 s
# Limit the training set to Prague only: keep the rows whose index starts
# with the region prefix '65806'. Each frame is filtered by its own index.
X_train = X_train[X_train.index.str.startswith('65806')]
y = y[y.index.str.startswith('65806')]
groups = groups[groups.index.str.startswith('65806')]
Only buildings are classified, so drop the empty ETCs (rows whose ID ends in a negative number).
# Drop every row whose last '_'-separated index part starts with '-'
# (a negative ID), then replace the remaining missing values with 0.
X_train = X_train[~X_train.index.str.split('_').str[-1].str.startswith('-')].fillna(0)
y = y[~y.index.str.split('_').str[-1].str.startswith('-')].fillna(0)
groups = groups[~groups.index.str.split('_').str[-1].str.startswith('-')].fillna(0)
Set up a k-fold splitter based on spatial contiguity
# Stratified 5-fold splitter that keeps all samples sharing a hexagon ID in
# the same fold.
gkf = model_selection.StratifiedGroupKFold(n_splits=5)

# `split` returns a one-shot generator; materialise it so the same folds can
# be reused if the grid search is rebuilt or refitted without re-running
# this cell.
splits = list(
    gkf.split(
        X_train.values,
        y.final_without_noise.values,
        groups=groups.hexagons.values,
    )
)
Set up the grid search and evaluation pipeline
# Random forest wrapped in a probability calibrator; the forest is the
# calibrator's `estimator`, so its hyper-parameters are addressed with the
# 'estimator__' prefix in the grid below.
calibrated_forest = CalibratedClassifierCV(
    estimator=RandomForestClassifier(random_state=123, n_jobs=-1)
)

param_grid = {
    # 'estimator__max_depth': [2, 4, 6, 8, 20],
    'estimator__max_depth': [2, 4, 6],
    # 'estimator__min_samples_split': [2, 50, 100, 200, 400]
    'estimator__min_samples_split': [200, 400]
}

# Both metrics are recorded for every candidate; "Accuracy" is the one used
# to pick and refit the final model (see `refit` below).
scoring = {"Balanced Accuracy": make_scorer(balanced_accuracy_score), "Accuracy": make_scorer(accuracy_score)}

search = GridSearchCV(calibrated_forest, param_grid, cv=splits, scoring=scoring, refit="Accuracy", return_train_score=True)
Train and tune the model
%%time
# Run the grid search on the raw feature array against the
# `final_without_noise` labels.
search.fit(X=X_train.values, y=y.final_without_noise.values)
Select the best model and predict the test data
# best_model=RandomForestClassifier(random_state=123, n_jobs=-1)
# best_model.fit(X_train, y)
# The refitted winner of the grid search. Fitted attributes of scikit-learn
# estimators carry a trailing underscore, so the attribute is
# `best_estimator_` (plain `best_estimator` does not exist).
best_model = search.best_estimator_
KeyboardInterrupt
# Load the held-out test features and their labels.
X_test = pd.read_parquet('/data/uscuni-eurofab/processed_data/train_test_data/testing_data4.pq')
y_test = pd.read_parquet('/data/uscuni-eurofab/processed_data/train_test_data/testing_labels4.pq')
# Limit the test set to Vienna only: keep the rows whose index starts with
# the region prefix '84986'.
X_test = X_test[X_test.index.str.startswith('84986')]
y_test = y_test[y_test.index.str.startswith('84986')]
# Same clean-up as for the training data: drop rows whose last '_'-separated
# index part starts with '-' and replace remaining missing values with 0.
X_test = X_test[~X_test.index.str.split('_').str[-1].str.startswith('-')].fillna(0)
y_test = y_test[~y_test.index.str.split('_').str[-1].str.startswith('-')].fillna(0)
# The search was fitted on plain arrays (`X_train.values`), so predict on a
# plain array as well; passing the DataFrame would make scikit-learn warn
# about feature names that were absent at fit time.
predictions = best_model.predict(X_test.values)

# scikit-learn metrics take the ground truth first and the predictions
# second. The model was trained on the `final_without_noise` column, so the
# same column is used as the reference here.
accuracy_score(y_test.final_without_noise, predictions)
0.29364393026817087
Plot predictions
from lonboard import SolidPolygonLayer, Map
from lonboard.basemap import CartoBasemap
from lonboard.colormap import apply_categorical_cmap
from palettable.colorbrewer.qualitative import Set3_12
from core.cluster_validation import get_color
# Region whose buildings are plotted; the same ID is used as the index prefix
# of the test labels.
region_id = 84986
buildings = gpd.read_parquet(f'/data/uscuni-eurofab/processed_data/buildings/buildings_{region_id}.parquet')
# True labels of the selected region, re-indexed by the integer that follows
# the last '_' in each ID.
# presumably this integer matches the index of `buildings` - TODO confirm
labels = y_test[y_test.index.str.startswith(str(region_id))].final_without_noise
labels.index = labels.index.str.split('_').str[-1].astype(int)

# Attach predictions and true labels to the buildings by index alignment;
# buildings without a matching label end up with NaN in both columns.
# NOTE: pairing `predictions` with `labels.index` requires both to have the
# same length and order, i.e. `y_test` must hold exactly the rows of `X_test`.
buildings["predicted_label"] = pd.Series(predictions, labels.index)
buildings["true_label"] = labels
# Polygon layer of the buildings, filled according to their true label.
# NOTE(review): the recorded output of this cell shows "RuntimeWarning:
# invalid value encountered in cast" on the `astype(int)` below, presumably
# because buildings without a label carry NaN in `true_label` - confirm and
# decide whether such rows should be dropped or filled before casting.
layer = SolidPolygonLayer.from_geopandas(
    gdf=buildings[["geometry", "predicted_label", 'true_label']],
    get_fill_color=get_color(buildings['true_label'].values.astype(int)),
    opacity=0.15,
)
/tmp/ipykernel_871757/1019886908.py:2: RuntimeWarning: invalid value encountered in cast
gdf=buildings[["geometry", "predicted_label", 'true_label']], get_fill_color=get_color(buildings['true_label'].values.astype(int)), opacity=0.15
/home/krasen/eurofab_morphometrics/.pixi/envs/default/lib/python3.12/site-packages/lonboard/_geoarrow/ops/reproject.py:97: UserWarning: Input being reprojected to EPSG:4326 CRS
warnings.warn("Input being reprojected to EPSG:4326 CRS")
# Build the interactive map on the Positron basemap; the trailing bare name
# makes the notebook render it as the cell output.
m = Map(layer, basemap_style=CartoBasemap.Positron)
m