import glob
import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from core.utils import used_keys
from palettable.colorbrewer.qualitative import Set3_12
from sklearn.metrics import davies_bouldin_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn import model_selection
from sklearn.metrics import accuracy_score, balanced_accuracy_score, make_scorer
from core.generate_predictions import read_train_test, get_cluster_names, get_level_cut
Eurofab model training
= 'v3'
v
= 600_000
sample_size
= 4
mapping_level
= 7 train_test_iteration
Generate the train test data for the specified iteration and train the model
%%time
= read_train_test(train_test_iteration, mapping_level, sample_size)
X_resampled, y_resampled
from sklearn.ensemble import HistGradientBoostingClassifier
= HistGradientBoostingClassifier(random_state=123, verbose=1,
model = 0.03,
learning_rate = None,
max_depth = 120,
max_iter =None,
max_leaf_nodes=.5
max_features
)
model.fit(X_resampled, y_resampled)print(model.score(X_resampled, y_resampled))
final_without_noise
6 600000
1 600000
7 600000
8 600000
14 600000
2 600000
15 600000
13 600000
5 600000
16 600000
9 600000
12 600000
11 600000
Name: count, dtype: int64
Binning 8.087 GB of training data: 3.625 s
Binning 0.899 GB of validation data: 0.393 s
Fitting gradient boosted rounds:
[1/120] 13 trees, 893629 leaves (68740 on avg), max depth = 59, train loss: 2.32222, val loss: 2.36821, in 129.254s
[2/120] 13 trees, 3523422 leaves (271032 on avg), max depth = 157, train loss: 2.13320, val loss: 2.21806, in 366.993s
[3/120] 13 trees, 3553717 leaves (273362 on avg), max depth = 152, train loss: 1.97650, val loss: 2.09519, in 373.911s
[4/120] 13 trees, 3552116 leaves (273239 on avg), max depth = 176, train loss: 1.84211, val loss: 1.99092, in 377.135s
[5/120] 13 trees, 3548134 leaves (272933 on avg), max depth = 151, train loss: 1.72445, val loss: 1.90045, in 376.539s
[6/120] 13 trees, 3547731 leaves (272902 on avg), max depth = 164, train loss: 1.61958, val loss: 1.82023, in 377.085s
[7/120] 13 trees, 3547447 leaves (272880 on avg), max depth = 147, train loss: 1.52518, val loss: 1.74856, in 378.116s
[8/120] 13 trees, 3546411 leaves (272800 on avg), max depth = 148, train loss: 1.43953, val loss: 1.68391, in 378.931s
[9/120] 13 trees, 3545132 leaves (272702 on avg), max depth = 144, train loss: 1.36107, val loss: 1.62483, in 378.053s
[10/120] 13 trees, 3545586 leaves (272737 on avg), max depth = 152, train loss: 1.28891, val loss: 1.57076, in 377.779s
[11/120] 13 trees, 3544338 leaves (272641 on avg), max depth = 145, train loss: 1.22210, val loss: 1.52069, in 378.350s
[12/120] 13 trees, 3544490 leaves (272653 on avg), max depth = 145, train loss: 1.16011, val loss: 1.47441, in 380.143s
[13/120] 13 trees, 3545625 leaves (272740 on avg), max depth = 129, train loss: 1.10232, val loss: 1.43133, in 380.879s
[14/120] 13 trees, 3544251 leaves (272634 on avg), max depth = 146, train loss: 1.04834, val loss: 1.39104, in 381.777s
[15/120] 13 trees, 3544744 leaves (272672 on avg), max depth = 146, train loss: 0.99778, val loss: 1.35346, in 381.633s
[16/120] 13 trees, 3544013 leaves (272616 on avg), max depth = 140, train loss: 0.95035, val loss: 1.31827, in 381.941s
[17/120] 13 trees, 3545107 leaves (272700 on avg), max depth = 131, train loss: 0.90571, val loss: 1.28509, in 380.251s
[18/120] 13 trees, 3545243 leaves (272711 on avg), max depth = 136, train loss: 0.86369, val loss: 1.25399, in 383.591s
[19/120] 13 trees, 3545469 leaves (272728 on avg), max depth = 118, train loss: 0.82398, val loss: 1.22455, in 384.120s
[20/120] 13 trees, 3545363 leaves (272720 on avg), max depth = 166, train loss: 0.78651, val loss: 1.19675, in 385.436s
[21/120] 13 trees, 3546150 leaves (272780 on avg), max depth = 127, train loss: 0.75105, val loss: 1.17043, in 385.135s
[22/120] 13 trees, 3545906 leaves (272762 on avg), max depth = 125, train loss: 0.71745, val loss: 1.14549, in 384.697s
[23/120] 13 trees, 3546285 leaves (272791 on avg), max depth = 121, train loss: 0.68560, val loss: 1.12183, in 384.964s
[24/120] 13 trees, 3545721 leaves (272747 on avg), max depth = 137, train loss: 0.65540, val loss: 1.09940, in 386.259s
[25/120] 13 trees, 3547225 leaves (272863 on avg), max depth = 131, train loss: 0.62670, val loss: 1.07798, in 386.087s
[26/120] 13 trees, 3546474 leaves (272805 on avg), max depth = 127, train loss: 0.59944, val loss: 1.05761, in 387.003s
[27/120] 13 trees, 3546956 leaves (272842 on avg), max depth = 126, train loss: 0.57350, val loss: 1.03825, in 386.453s
[28/120] 13 trees, 3546961 leaves (272843 on avg), max depth = 119, train loss: 0.54881, val loss: 1.01974, in 388.014s
[29/120] 13 trees, 3548322 leaves (272947 on avg), max depth = 131, train loss: 0.52532, val loss: 1.00217, in 388.306s
[30/120] 13 trees, 3547040 leaves (272849 on avg), max depth = 132, train loss: 0.50294, val loss: 0.98530, in 386.288s
[31/120] 13 trees, 3548367 leaves (272951 on avg), max depth = 118, train loss: 0.48159, val loss: 0.96917, in 388.236s
[32/120] 13 trees, 3549004 leaves (273000 on avg), max depth = 127, train loss: 0.46125, val loss: 0.95380, in 388.696s
[33/120] 13 trees, 3549247 leaves (273019 on avg), max depth = 131, train loss: 0.44185, val loss: 0.93913, in 387.675s
[34/120] 13 trees, 3548727 leaves (272979 on avg), max depth = 136, train loss: 0.42334, val loss: 0.92506, in 389.091s
[35/120] 13 trees, 3549713 leaves (273054 on avg), max depth = 125, train loss: 0.40565, val loss: 0.91154, in 389.230s
[36/120] 13 trees, 3549362 leaves (273027 on avg), max depth = 121, train loss: 0.38876, val loss: 0.89856, in 387.581s
[37/120] 13 trees, 3549728 leaves (273056 on avg), max depth = 114, train loss: 0.37265, val loss: 0.88617, in 391.615s
[38/120] 13 trees, 3550534 leaves (273118 on avg), max depth = 126, train loss: 0.35725, val loss: 0.87429, in 392.228s
[39/120] 13 trees, 3550237 leaves (273095 on avg), max depth = 134, train loss: 0.34254, val loss: 0.86290, in 392.463s
[40/120] 13 trees, 3550816 leaves (273139 on avg), max depth = 113, train loss: 0.32848, val loss: 0.85204, in 391.219s
[41/120] 13 trees, 3550844 leaves (273141 on avg), max depth = 118, train loss: 0.31504, val loss: 0.84155, in 393.043s
[42/120] 13 trees, 3551307 leaves (273177 on avg), max depth = 125, train loss: 0.30218, val loss: 0.83148, in 390.421s
[43/120] 13 trees, 3550584 leaves (273121 on avg), max depth = 118, train loss: 0.28987, val loss: 0.82175, in 396.341s
[44/120] 13 trees, 3551836 leaves (273218 on avg), max depth = 122, train loss: 0.27809, val loss: 0.81238, in 397.013s
[45/120] 13 trees, 3551863 leaves (273220 on avg), max depth = 118, train loss: 0.26681, val loss: 0.80339, in 394.360s
[46/120] 13 trees, 3551294 leaves (273176 on avg), max depth = 117, train loss: 0.25604, val loss: 0.79481, in 395.175s
[47/120] 13 trees, 3551400 leaves (273184 on avg), max depth = 116, train loss: 0.24573, val loss: 0.78656, in 397.259s
[48/120] 13 trees, 3551399 leaves (273184 on avg), max depth = 116, train loss: 0.23586, val loss: 0.77865, in 399.250s
[49/120] 13 trees, 3552174 leaves (273244 on avg), max depth = 112, train loss: 0.22641, val loss: 0.77098, in 396.697s
[50/120] 13 trees, 3551382 leaves (273183 on avg), max depth = 113, train loss: 0.21734, val loss: 0.76362, in 396.233s
[51/120] 13 trees, 3552454 leaves (273265 on avg), max depth = 112, train loss: 0.20867, val loss: 0.75651, in 393.356s
[52/120] 13 trees, 3552122 leaves (273240 on avg), max depth = 116, train loss: 0.20037, val loss: 0.74967, in 394.411s
[53/120] 13 trees, 3552395 leaves (273261 on avg), max depth = 112, train loss: 0.19241, val loss: 0.74307, in 395.038s
[54/120] 13 trees, 3553077 leaves (273313 on avg), max depth = 119, train loss: 0.18478, val loss: 0.73671, in 395.145s
[55/120] 13 trees, 3553441 leaves (273341 on avg), max depth = 112, train loss: 0.17747, val loss: 0.73058, in 396.733s
[56/120] 13 trees, 3553053 leaves (273311 on avg), max depth = 122, train loss: 0.17046, val loss: 0.72464, in 396.646s
[57/120] 13 trees, 3552633 leaves (273279 on avg), max depth = 117, train loss: 0.16375, val loss: 0.71895, in 397.845s
[58/120] 13 trees, 3554456 leaves (273419 on avg), max depth = 115, train loss: 0.15731, val loss: 0.71345, in 397.741s
[59/120] 13 trees, 3553810 leaves (273370 on avg), max depth = 126, train loss: 0.15114, val loss: 0.70817, in 396.939s
[60/120] 13 trees, 3553886 leaves (273375 on avg), max depth = 112, train loss: 0.14523, val loss: 0.70306, in 396.179s
[61/120] 13 trees, 3553906 leaves (273377 on avg), max depth = 109, train loss: 0.13955, val loss: 0.69814, in 395.085s
[62/120] 13 trees, 3554587 leaves (273429 on avg), max depth = 112, train loss: 0.13411, val loss: 0.69335, in 398.457s
[63/120] 13 trees, 3553701 leaves (273361 on avg), max depth = 125, train loss: 0.12889, val loss: 0.68878, in 401.568s
[64/120] 13 trees, 3554137 leaves (273395 on avg), max depth = 122, train loss: 0.12388, val loss: 0.68431, in 402.053s
[65/120] 13 trees, 3553766 leaves (273366 on avg), max depth = 113, train loss: 0.11908, val loss: 0.68003, in 401.556s
[66/120] 13 trees, 3554696 leaves (273438 on avg), max depth = 122, train loss: 0.11447, val loss: 0.67587, in 401.360s
[67/120] 13 trees, 3554903 leaves (273454 on avg), max depth = 109, train loss: 0.11004, val loss: 0.67183, in 402.173s
[68/120] 13 trees, 3554957 leaves (273458 on avg), max depth = 109, train loss: 0.10580, val loss: 0.66801, in 402.843s
[69/120] 13 trees, 3554710 leaves (273439 on avg), max depth = 114, train loss: 0.10173, val loss: 0.66433, in 403.002s
[70/120] 13 trees, 3554676 leaves (273436 on avg), max depth = 108, train loss: 0.09782, val loss: 0.66072, in 403.448s
[71/120] 13 trees, 3554488 leaves (273422 on avg), max depth = 109, train loss: 0.09407, val loss: 0.65724, in 403.679s
[72/120] 13 trees, 3554866 leaves (273451 on avg), max depth = 113, train loss: 0.09047, val loss: 0.65385, in 404.623s
[73/120] 13 trees, 3555466 leaves (273497 on avg), max depth = 113, train loss: 0.08701, val loss: 0.65062, in 402.190s
[74/120] 13 trees, 3554944 leaves (273457 on avg), max depth = 129, train loss: 0.08370, val loss: 0.64749, in 403.485s
[75/120] 13 trees, 3554835 leaves (273448 on avg), max depth = 116, train loss: 0.08051, val loss: 0.64447, in 404.000s
[76/120] 13 trees, 3554684 leaves (273437 on avg), max depth = 115, train loss: 0.07745, val loss: 0.64156, in 402.361s
[77/120] 13 trees, 3554047 leaves (273388 on avg), max depth = 112, train loss: 0.07451, val loss: 0.63871, in 401.778s
[78/120] 13 trees, 3555476 leaves (273498 on avg), max depth = 118, train loss: 0.07168, val loss: 0.63592, in 405.132s
[79/120] 13 trees, 3556083 leaves (273544 on avg), max depth = 109, train loss: 0.06897, val loss: 0.63330, in 403.025s
[80/120] 13 trees, 3555077 leaves (273467 on avg), max depth = 115, train loss: 0.06637, val loss: 0.63078, in 405.793s
[81/120] 13 trees, 3555728 leaves (273517 on avg), max depth = 120, train loss: 0.06386, val loss: 0.62834, in 404.846s
[82/120] 13 trees, 3555683 leaves (273514 on avg), max depth = 116, train loss: 0.06146, val loss: 0.62600, in 403.964s
[83/120] 13 trees, 3555347 leaves (273488 on avg), max depth = 107, train loss: 0.05916, val loss: 0.62372, in 403.931s
[84/120] 13 trees, 3555362 leaves (273489 on avg), max depth = 113, train loss: 0.05694, val loss: 0.62152, in 403.907s
[85/120] 13 trees, 3555445 leaves (273495 on avg), max depth = 114, train loss: 0.05481, val loss: 0.61940, in 403.666s
[86/120] 13 trees, 3555369 leaves (273489 on avg), max depth = 107, train loss: 0.05276, val loss: 0.61734, in 405.019s
[87/120] 13 trees, 3555200 leaves (273476 on avg), max depth = 109, train loss: 0.05079, val loss: 0.61536, in 404.703s
[88/120] 13 trees, 3555591 leaves (273507 on avg), max depth = 110, train loss: 0.04890, val loss: 0.61348, in 405.277s
[89/120] 13 trees, 3555534 leaves (273502 on avg), max depth = 116, train loss: 0.04708, val loss: 0.61166, in 405.436s
[90/120] 13 trees, 3555090 leaves (273468 on avg), max depth = 121, train loss: 0.04533, val loss: 0.60990, in 405.089s
[91/120] 13 trees, 3556039 leaves (273541 on avg), max depth = 111, train loss: 0.04365, val loss: 0.60817, in 406.238s
[92/120] 13 trees, 3555579 leaves (273506 on avg), max depth = 119, train loss: 0.04203, val loss: 0.60652, in 404.628s
[93/120] 13 trees, 3555924 leaves (273532 on avg), max depth = 107, train loss: 0.04048, val loss: 0.60491, in 406.020s
[94/120] 13 trees, 3555315 leaves (273485 on avg), max depth = 120, train loss: 0.03898, val loss: 0.60331, in 406.914s
[95/120] 13 trees, 3556460 leaves (273573 on avg), max depth = 105, train loss: 0.03754, val loss: 0.60183, in 408.578s
[96/120] 13 trees, 3555297 leaves (273484 on avg), max depth = 114, train loss: 0.03616, val loss: 0.60038, in 408.850s
[97/120] 13 trees, 3555891 leaves (273530 on avg), max depth = 113, train loss: 0.03483, val loss: 0.59898, in 409.125s
[98/120] 13 trees, 3555591 leaves (273507 on avg), max depth = 112, train loss: 0.03355, val loss: 0.59763, in 407.951s
[99/120] 13 trees, 3555537 leaves (273502 on avg), max depth = 105, train loss: 0.03232, val loss: 0.59637, in 406.495s
[100/120] 13 trees, 3555783 leaves (273521 on avg), max depth = 110, train loss: 0.03114, val loss: 0.59514, in 407.627s
[101/120] 13 trees, 3556060 leaves (273543 on avg), max depth = 108, train loss: 0.03000, val loss: 0.59393, in 405.682s
[102/120] 13 trees, 3556198 leaves (273553 on avg), max depth = 105, train loss: 0.02890, val loss: 0.59280, in 407.439s
[103/120] 13 trees, 3556003 leaves (273538 on avg), max depth = 114, train loss: 0.02785, val loss: 0.59166, in 406.388s
[104/120] 13 trees, 3555528 leaves (273502 on avg), max depth = 108, train loss: 0.02684, val loss: 0.59064, in 407.533s
[105/120] 13 trees, 3556125 leaves (273548 on avg), max depth = 103, train loss: 0.02586, val loss: 0.58962, in 407.553s
[106/120] 13 trees, 3554873 leaves (273451 on avg), max depth = 111, train loss: 0.02492, val loss: 0.58868, in 407.946s
[107/120] 13 trees, 3555562 leaves (273504 on avg), max depth = 105, train loss: 0.02402, val loss: 0.58776, in 407.907s
[108/120] 13 trees, 3555250 leaves (273480 on avg), max depth = 98, train loss: 0.02315, val loss: 0.58688, in 405.583s
[109/120] 13 trees, 3555734 leaves (273518 on avg), max depth = 111, train loss: 0.02232, val loss: 0.58608, in 406.225s
[110/120] 13 trees, 3555938 leaves (273533 on avg), max depth = 105, train loss: 0.02151, val loss: 0.58526, in 408.091s
[111/120] 13 trees, 3555400 leaves (273492 on avg), max depth = 99, train loss: 0.02074, val loss: 0.58448, in 406.325s
[112/120] 13 trees, 3556474 leaves (273574 on avg), max depth = 98, train loss: 0.01999, val loss: 0.58371, in 408.967s
[113/120] 13 trees, 3555713 leaves (273516 on avg), max depth = 97, train loss: 0.01927, val loss: 0.58300, in 408.737s
[114/120] 13 trees, 3555699 leaves (273515 on avg), max depth = 109, train loss: 0.01858, val loss: 0.58236, in 409.570s
[115/120] 13 trees, 3554598 leaves (273430 on avg), max depth = 107, train loss: 0.01791, val loss: 0.58171, in 408.524s
[116/120] 13 trees, 3554971 leaves (273459 on avg), max depth = 94, train loss: 0.01727, val loss: 0.58107, in 406.443s
[117/120] 13 trees, 3555144 leaves (273472 on avg), max depth = 98, train loss: 0.01665, val loss: 0.58051, in 406.198s
[118/120] 13 trees, 3555954 leaves (273534 on avg), max depth = 101, train loss: 0.01606, val loss: 0.57994, in 406.120s
[119/120] 13 trees, 3554978 leaves (273459 on avg), max depth = 99, train loss: 0.01548, val loss: 0.57945, in 405.640s
[120/120] 13 trees, 3555301 leaves (273484 on avg), max depth = 94, train loss: 0.01493, val loss: 0.57896, in 405.478s
Fit 1560 trees in 47338.352 s, (423577426 total leaves)
Time spent computing histograms: 27154.602s
Time spent finding best splits: 6091.597s
Time spent applying splits: 8232.458s
Time spent predicting: 205.419s
0.9806842307692307
CPU times: user 5d 15h 9min 8s, sys: 3h 17min 18s, total: 5d 18h 26min 27s
Wall time: 13h 21min 7s
Store the model to disk
%%time
from pickle import dump
with open(f"../data/model_{mapping_level}_{train_test_iteration}.pkl", "wb") as f:
=5) dump(model, f, protocol
CPU times: user 44.9 ms, sys: 11.9 s, total: 11.9 s
Wall time: 12.6 s
Read the test data and measure the model performance
= '/data/uscuni-eurofab-overture/' coredir
= get_level_cut(mapping_level)
level_cut = pd.read_parquet(f'{coredir}processed_data/train_test_data/testing_data{train_test_iteration}.pq')
X_test = pd.read_parquet(f'{coredir}processed_data/train_test_data/testing_labels{train_test_iteration}.pq')
y_test 'final_without_noise'] = y_test['final_without_noise'].map(level_cut.to_dict())
y_test[
= get_cluster_names(mapping_level)
cluster_names
assert y_test.final_without_noise.isna().sum() == 0
assert (X_test.index == y_test.index).all()
print(y_test.final_without_noise.map(cluster_names).value_counts())
if 'source' in X_test.columns:
# we can do this because of random forest splitting
= pd.Series(np.arange(len(source_factorizer[1])), source_factorizer[1].values, ).to_dict()
factorizer_dict 'source'] = X_test['source'].map(factorizer_dict)
X_test[
## predictions
= model.predict(X_test)
predictions
= f1_score(y_test, predictions, average='weighted')
weighted = f1_score(y_test, predictions, average='micro')
micro = f1_score(y_test, predictions, average='macro')
macro = pd.Series([weighted, micro, macro], index=['Weighted F1', 'Micro F1', 'Macro F1'])
overall_acc
= f1_score(y_test, predictions, average=None)
f1s_vals = pd.Series(
f1s
f1s_vals,= [cluster_names[k] for k in sorted(np.unique(predictions))]
index
)= f1s.sort_values() f1s
final_without_noise
Aligned Winding Streets 1911488
Sparse Open Layout 1901302
Dense Connected Developments 1843043
Compact Development 1646066
Sparse Road-Linked Development 1377693
Cul-de-Sac Layout 1362516
Dense Standalone Buildings 1073182
Sparse Rural Development 643288
Linear Development 464712
Large Interconnected Blocks 218424
Extensive Wide-Spaced Developments 186941
Large Utilitarian Development 178777
Dispersed Linear Development 167466
Name: count, dtype: int64
overall_acc
Weighted F1 0.545135
Micro F1 0.543666
Macro F1 0.542424
dtype: float64
f1s.sort_index()
Aligned Winding Streets 0.455454
Compact Development 0.548607
Cul-de-Sac Layout 0.512535
Dense Connected Developments 0.584578
Dense Standalone Buildings 0.565387
Dispersed Linear Development 0.649112
Extensive Wide-Spaced Developments 0.363151
Large Interconnected Blocks 0.601881
Large Utilitarian Development 0.435066
Linear Development 0.528131
Sparse Open Layout 0.607502
Sparse Road-Linked Development 0.488297
Sparse Rural Development 0.711807
dtype: float64
Generate a confusion matrix.
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
= ['Dense Connected Developments',
labels 'Large Interconnected Blocks',
'Dense Standalone Buildings',
'Compact Development',
'Cul-de-Sac Layout',
'Aligned Winding Streets',
'Sparse Rural Development',
'Dispersed Linear Development',
'Linear Development',
'Sparse Open Layout',
'Sparse Road-Linked Development',
'Large Utilitarian Development',
'Extensive Wide-Spaced Developments']
= confusion_matrix(y_test.final_without_noise.map(cluster_names),
cm map(cluster_names),
pd.Series(predictions).=labels,
labels='true'
normalize )
= ConfusionMatrixDisplay(confusion_matrix=np.round(cm, decimals=2),
disp =labels
display_labels )
= plt.subplots(figsize=(12,12), dpi=300)
fig,ax = 'vertical', ax=ax) disp.plot( xticks_rotation
Plot predictions
import pandas as pd
import geopandas as gpd
import pickle
from lonboard import SolidPolygonLayer, Map
from lonboard.basemap import CartoBasemap
from lonboard.colormap import apply_categorical_cmap
from palettable.colorbrewer.qualitative import Set3_12
from core.cluster_validation import get_color
from core.utils import combine_regions_data
from core.generate_predictions import read_train_test, get_cluster_names, get_level_cut
= "/data/uscuni-eurofab-overture/"
regions_datadir = gpd.read_parquet(
region_hulls + "regions/" + "ov_ce_region_hulls.parquet"
regions_datadir
) region_hulls.explore()
Specify model parameters.
= '/data/uscuni-eurofab-overture/'
coredir = 4
mapping_level = 7 train_test_iteration
Read model and test data
with open(f'../data/model_{mapping_level}_{train_test_iteration}.pkl', 'rb') as f:
= pickle.load(f) model
= get_level_cut(mapping_level)
level_cut = pd.read_parquet(f'{coredir}processed_data/train_test_data/testing_labels{train_test_iteration}.pq')
y_test 'final_without_noise'] = y_test['final_without_noise'].map(level_cut.to_dict())
y_test[
= get_cluster_names(mapping_level) cluster_names
Generate predictions for specific regions.
def get_buildings_and_predictions(region_id):
""" for a specific reigon, generate the data for the predictions and index the test data"""
= combine_regions_data([region_id], char_type='only_unweighted_lag_overture')
all_data, all_labels, _ = all_data.index.str.split('_').str[-1].astype(int)
predicted_index = all_labels.index.isin(y_test.index)
region_test_labels
# predict the whole region
= model.predict(all_data)
regional_predictions # map the original predictions and whether the test cases are accurate
= y_test.index.isin(all_labels.index)
region_test_labels # accurate = predictions[region_test_labels] == y_test[region_test_labels].final_without_noise
# accurate.index = accurate.index.str.split('_').str[-1].astype(int)
# read the building polygons and assign the new predictions
= gpd.read_parquet(f'/data/uscuni-eurofab-overture/processed_data/buildings/buildings_{region_id}.parquet')
buildings = buildings.loc[predicted_index]
buildings 'prediction'] = regional_predictions.astype(int)
buildings['in_testset'] = all_labels.index.isin(y_test.index)
buildings[# buildings['accurate'] = accurate
return buildings
%%time
= get_buildings_and_predictions(84568)
vienna_buildings = get_buildings_and_predictions(90996)
brno_buildings = get_buildings_and_predictions(36769)
munich_buildings = get_buildings_and_predictions(91484) wroclaw_buildings
CPU times: user 17min 48s, sys: 10.9 s, total: 17min 59s
Wall time: 1min 55s
Plot the prediction results for the four regions on one figure.
import shapely
import contextily as cx
import matplotlib.pyplot as plt
from matplotlib.colors import to_hex
from matplotlib import colormaps
from matplotlib_scalebar.scalebar import ScaleBar
import matplotlib.patches as mpatches
= [to_hex(c) for c in colormaps.get_cmap('tab20b').colors] cmap
def plot_predictions(buildings, ax, cmap, name=''):
"""Plot the buildings with a specifc colormap on a specifc axes."""
= shapely.box(*shapely.Point(buildings.centroid.x.mean(), buildings.centroid.y.mean()).buffer(10_000).bounds)
buildings_centre = buildings[buildings.intersects(buildings_centre)]
to_plot
for cluster in to_plot.prediction.unique():
== cluster].plot(ax=ax, color=cmap[cluster])
to_plot[to_plot.prediction
ax.set_axis_off()
ax.set_title(name)
=buildings.crs, source=cx.providers.CartoDB.PositronNoLabels)
cx.add_basemap(ax, crs1, location="upper right"))
ax.add_artist(ScaleBar(
return ax
= plt.subplots(2, 2, figsize=(16,16), dpi=300)
fig, ax
= plot_predictions(vienna_buildings, ax[0][0], cmap, 'Vienna')
h1 = plot_predictions(munich_buildings, ax[0][1], cmap, 'Munich')
h2 = plot_predictions(wroclaw_buildings, ax[1][0], cmap, 'Wroclaw')
h3 = plot_predictions(brno_buildings, ax[1][1], cmap, 'Brno')
h4
= [mpatches.Patch(color=cmap[cluster], label=cluster_names[str(cluster)]) for cluster in range(1, 17)]
legend_patches =legend_patches, loc='lower right') fig.legend(handles
Interactive plot for a specific region
= wroclaw_buildings plot_buildings
= SolidPolygonLayer.from_geopandas(
layer =plot_buildings, opacity=0.15
gdf )
/home/krasen/morphometrics/.pixi/envs/default/lib/python3.12/site-packages/lonboard/_geoarrow/ops/reproject.py:97: UserWarning: Input being reprojected to EPSG:4326 CRS
warnings.warn("Input being reprojected to EPSG:4326 CRS")
'prediction'].values.astype(int)) np.unique(plot_buildings[
array([ 1, 2, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16])
= Map(layer, basemap_style=CartoBasemap.DarkMatter) m
from sidecar import Sidecar
= Sidecar(title='buildings')
sc with sc:
display(m)
= get_color(np.arange(0, 20)) final_colors
= final_colors[plot_buildings['prediction'].astype(int)] layer.get_fill_color
= final_colors[plot_buildings['accurate'].astype(int)] layer.get_fill_color