Assign all ETCs in all regions to h3 hexagons, which will be used to limit spatial linkage in the model train/test split.

import h3
import shapely
import pandas as pd
import tobler
import geopandas as gpd
h3_resolution = 7
regions_datadir = "/data/uscuni-eurofab/"
tessellations_dir = '/data/uscuni-eurofab/processed_data/tessellations/'
buildings_dir = '/data/uscuni-eurofab/processed_data/buildings/'

region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "ms_ce_region_hulls.parquet"
    )
region_hulls.shape
(474, 1)
def assign_hexagons(region_id, region_hull):
    '''Assign all ETCs in a reigion to h3 hexagons.'''
    
    ## split region hull into hexagons
    bounds = region_hull.iloc[0]
    poly = h3.geo_to_cells(bounds, res=h3_resolution)
    res = [shapely.geometry.shape(h3.cells_to_geo([p])) for p in poly]
    hexagons = gpd.GeoSeries(res, index=poly,name='geometry', crs='epsg:4326').to_crs(epsg=3035)

    tess = gpd.read_parquet(
            tessellations_dir + f"tessellation_{region_id}.parquet"
    )

    # assign hexagons to tessellation cells
    inp, res = tess.sindex.query(hexagons, predicate='intersects')
    # polygons should be assigned to only one h3 grid
    duplicated = pd.Series(res).duplicated()
    inp = inp[~duplicated]
    res = res[~duplicated]
    
    hex_assignments = pd.Series(hexagons.index[inp].values, tess.index[res], name='hexagons').sort_index()
    return hex_assignments
%%time
for region_id, region_hull in region_hulls.to_crs(epsg=4326).iterrows():
    print(region_id)
    hex_assignments = assign_hexagons(region_id, region_hull)
    hex_assignments.reset_index().to_parquet(f'/data/uscuni-eurofab/processed_data/hexagons/{region_id}_hexagon.pq')
19
24
33
478
754
817
1049
1485
1677
2415
2513
2707
2785
2790
2820
3228
3307
3313
3357
3540
3661
3762
3806
4271
4285
4640
4763
5175
5189
5320
5429
5874
6337
6351
6477
6858
6881
7113
7381
7411
7640
7693
7728
7921
7924
8014
8087
8147
8440
8659
8927
8960
9560
9840
9887
10197
10283
10600
10673
10764
10875
11024
11178
11550
11623
11640
12080
12222
12247
12347
12401
12546
12614
12649
12695
12736
13285
13496
13497
14086
14327
14383
14605
14836
15151
15308
15362
15415
15540
15560
15646
16446
16582
16687
17219
17720
17763
17808
17857
17874
17951
18006
18143
18215
19325
19474
20008
20063
20356
20573
20597
20811
21128
22022
22398
22633
22770
23227
23621
23941
24079
24141
24683
24737
25065
25497
25588
25814
25964
26146
26642
26773
27374
27700
27783
27997
28059
28060
28237
28601
28751
28795
28961
29387
30259
30571
30662
30938
31101
31696
31807
32671
32890
33094
33150
33415
33718
33769
33803
34553
34902
36043
36064
36227
36327
36396
36457
37246
37360
37371
37637
37789
37937
38374
38429
38584
38606
38877
39130
39903
40933
41257
41618
41686
42224
42437
42465
42570
43084
44353
44577
44580
44676
44853
44980
45035
45263
45342
45356
45491
45681
45951
45966
46050
46338
46835
47308
47311
47899
48569
48590
50170
50415
50466
50669
51186
51454
51525
51553
51595
52243
53006
53082
53490
53658
54059
54551
54724
55117
55544
56004
56381
56860
57118
57279
57491
57959
57961
58553
58714
59253
60585
60752
60840
60950
61323
61799
61852
62599
62847
63185
63950
64170
64620
64720
64762
64900
65086
65153
65186
65204
65806
66259
66807
66864
67836
68214
68368
68705
68900
69027
70379
70515
70529
70612
70786
71286
71490
72623
73070
73089
73094
73296
73452
74302
74923
75752
75990
76214
77130
77145
77166
77454
78090
78925
79739
79783
80776
81352
81530
81666
81810
82184
82214
82385
82403
83044
83798
84160
84183
84268
84986
85331
86349
87058
87257
87464
87553
87669
88485
89034
89093
89706
90747
90960
91242
92304
92816
94055
94482
95277
95332
95578
95656
96235
97027
97307
97413
97707
97848
97855
98159
98760
98887
99047
99125
99281
99430
99460
99507
99758
99898
100224
100304
100594
100635
100722
100975
101123
101252
101332
101518
101969
102322
102334
102389
102490
102508
102938
103001
103298
103643
103658
103698
103735
103818
104251
104358
104396
104415
104653
105293
105313
105392
105846
105995
106753
106930
106999
107265
107428
107916
108140
108308
108376
108648
109005
109050
109585
109676
109949
110157
110373
111252
111314
111594
111672
111683
111801
112098
112842
112966
113249
113487
114261
114334
114685
114948
115294
115954
116035
116242
116830
117162
117200
118465
118956
119645
119966
119991
120673
120825
121214
122230
122372
122404
122529
122904
122940
123004
123370
123547
124008
124022
124219
124768
125010
125294
125389
125591
125840
126119
126381
126542
126723
127149
127562
128338
129117
129193
129335
129409
129600
130732
131498
131560
132469
132728
132805
133611
133765
133801
134054
134356
CPU times: user 3min 2s, sys: 46.8 s, total: 3min 49s
Wall time: 3min 41s

Explore assignment

region_id = 65806
hex_assignments = pd.read_parquet(f'/data/uscuni-eurofab/processed_data/hexagons/{region_id}_hexagon.pq').set_index('index')
selected = hex_assignments[hex_assignments.hexagons == '871e354ddffffff'].index
selected.shape
(1021,)
tess = gpd.read_parquet(
            tessellations_dir + f"tessellation_{region_id}.parquet"
    )
tess.loc[selected].explore()