Skip to content

Commit 0d5653b

Browse files
authored
Merge pull request #80 - Add dataframe/huggingface specific classes
feat(loader): Add dataframe/huggingface specific classes
2 parents 1690ec8 + 311fdf5 commit 0d5653b

File tree

44 files changed

+958
-856
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+958
-856
lines changed

docs/CONTRIBUTING.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -177,20 +177,21 @@ Pre-commit hooks enforce standards by running checks (like `ruff`) before commit
177177
Loaders pull data (e.g., CSV, Shapefiles) into a `GeoDataFrame`{ title="A GeoDataFrame is a pandas DataFrame with geospatial capabilities." }.
178178

179179
1. **Subclass `LoaderBase`** (`urban_mapper/modules/loader/abc_loader.py`):
180-
- Implement `load_data_from_file`. Refer to the `base` class for details.
180+
- Implement `_load`. Refer to the `base` class for details.
181+
Note: If your loader reads its data from a file, consider subclassing `FileLoaderBase` (`urban_mapper/modules/loader/file_loader.py`) instead.
181182
2. **Register It**:
182-
- Add to `FILE_LOADER_FACTORY` in `urban_mapper/modules/loader/loader_factory.py`.
183+
- Add to `LOADER_FACTORY` in `urban_mapper/modules/loader/loader_factory.py`.
183184

184185
**Example** (`csv_loader.py`):
185186
```python
186-
from urban_mapper.modules.loader.abc_loader import LoaderBase
187+
from urban_mapper.modules.loader.file_loader import FileLoaderBase
187188
import geopandas as gpd
188189
import pandas as pd
189190
from beartype import beartype
190191

191192
@beartype
192-
class CSVLoader(LoaderBase):
193-
def load_data_from_file(self) -> gpd.GeoDataFrame:
193+
class CSVLoader(FileLoaderBase):
194+
def _load(self) -> gpd.GeoDataFrame:
194195
df = pd.read_csv(self.file_path) #(1)
195196
# Convert to GeoDataFrame...
196197
return gdf

docs/api/loaders.md

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,31 +23,53 @@
2323
options:
2424
heading: "LoaderBase"
2525
members:
26-
- load_data_from_file
27-
- _load_data_from_file
26+
- load
27+
- _load
2828
- preview
2929

30+
## ::: urban_mapper.modules.loader.FileLoaderBase
31+
options:
32+
heading: "FileLoaderBase"
33+
members:
34+
- load
35+
- _load
36+
- preview
37+
3038
## ::: urban_mapper.modules.loader.CSVLoader
3139
options:
3240
heading: "CSVLoader"
3341
members:
34-
- _load_data_from_file
42+
- _load
3543
- preview
3644

3745
## ::: urban_mapper.modules.loader.ParquetLoader
3846
options:
3947
heading: "ParquetLoader"
4048
members:
41-
- _load_data_from_file
49+
- _load
4250
- preview
4351

4452
## ::: urban_mapper.modules.loader.ShapefileLoader
4553
options:
4654
heading: "ShapefileLoader"
4755
members:
48-
- _load_data_from_file
56+
- _load
4957
- preview
5058

59+
## ::: urban_mapper.modules.loader.DataFrameLoader
60+
options:
61+
heading: "DataFrameLoader"
62+
members:
63+
- _load
64+
- preview
65+
66+
## ::: urban_mapper.modules.loader.HuggingFaceLoader
67+
options:
68+
heading: "HuggingFaceLoader"
69+
members:
70+
- _load
71+
- preview
72+
5173
## ::: urban_mapper.modules.loader.LoaderFactory
5274
options:
5375
heading: "LoaderFactory"

docs/copy_of_examples/1-Per-Module/7-urban_pipeline.ipynb

Lines changed: 6 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -26,26 +26,6 @@
2626
"mapper = um.UrbanMapper()"
2727
]
2828
},
29-
{
30-
"cell_type": "code",
31-
"execution_count": null,
32-
"metadata": {},
33-
"outputs": [],
34-
"source": [
35-
"# Note: For the documentation interactive mode, we only query 100000 records from the dataset. Feel free to remove for a more realistic analysis.\n",
36-
"data = (\n",
37-
" um.UrbanMapper()\n",
38-
" .loader\n",
39-
" .from_huggingface(\"oscur/pluto\", number_of_rows=100000, streaming=True)\n",
40-
" .with_columns(\"longitude\", \"latitude\")\n",
41-
"# .with_columns(geometry_column=<geometry_column_name>\") # Replace <geometry_column_name> with the actual name of your geometry column instead of latitude and longitude columns. \n",
42-
" .load()\n",
43-
")\n",
44-
"data['longitude'] = data['longitude'].astype(float)\n",
45-
"data['latitude'] = data['latitude'].astype(float)\n",
46-
"data.to_csv(\"./pluto.csv\")"
47-
]
48-
},
4929
{
5030
"cell_type": "markdown",
5131
"metadata": {},
@@ -91,11 +71,11 @@
9171
" )\n",
9272
" .build()\n",
9373
")\n",
94-
"\n",
74+
"# Note: For the documentation interactive mode, we only query 100000 records from the dataset. Feel free to remove for a more realistic analysis.\n",
9575
"loader = (\n",
9676
" mapper\n",
9777
" .loader\n",
98-
" .from_file(\"./pluto.csv\")\n",
78+
" .from_huggingface(\"oscur/pluto\", number_of_rows=100000, streaming=True)\n",
9979
" .with_columns(\"longitude\", \"latitude\")\n",
10080
"#     .with_columns(geometry_column=\"<geometry_column_name>\") # Replace <geometry_column_name> with the actual name of your geometry column instead of latitude and longitude columns. \n",
10181
" .build()\n",
@@ -110,7 +90,7 @@
11090
")\n",
11191
"filter_step = mapper.filter.with_type(\"BoundingBoxFilter\").build()\n",
11292
"enricher = mapper.enricher.with_data(group_by=\"nearest_intersection\", values_from=\"numfloors\").aggregate_by(method=\"mean\", output_column=\"avg_floors\").build()\n",
113-
"visualiser = mapper.visual.with_type(\"Interactive\").with_style({\"tiles\": \"CartoDB dark_matter\", \"colorbar_text_color\": \"white\"}).build()\n",
93+
"visualiser = mapper.visual.with_type(\"Interactive\").with_style({\"tiles\": \"CartoDB Positron\", \"colorbar_text_color\": \"gray\"}).build()\n",
11494
"\n",
11595
"# Assemble the pipeline\n",
11696
"# Note that a pipeline's step is a tuple with a name and the step itself.\n",
@@ -260,46 +240,18 @@
260240
" .build()\n",
261241
")\n",
262242
"\n",
263-
"## It is not possible to use from_huggingface directly in the pipeline, because the online method that supports `.build()` is from_file\n",
264-
"## This feature should be changed in the next versions\n",
265-
"data = (\n",
266-
" mapper\n",
267-
" .loader\n",
268-
" .from_huggingface(\"oscur/pluto\", number_of_rows=1000, streaming=True)\n",
269-
" .with_columns(\"longitude\", \"latitude\")\n",
270-
"# .with_columns(geometry_column=<geometry_column_name>\") # Replace <geometry_column_name> with the actual name of your geometry column instead of latitude and longitude columns. \n",
271-
" .load()\n",
272-
")\n",
273-
"data['longitude'] = data['longitude'].astype(float)\n",
274-
"data['latitude'] = data['latitude'].astype(float)\n",
275-
"data.to_csv(\"./pluto.csv\")\n",
276-
"\n",
277-
"## It is not possible to use from_huggingface directly in the pipeline, because the online method that supports `.build()` is from_file\n",
278-
"## This feature should be changed in the next versions\n",
279-
"data = (\n",
280-
" mapper\n",
281-
" .loader\n",
282-
" .from_huggingface(\"oscur/taxisvis1M\", number_of_rows=1000, streaming=True)\n",
283-
" .with_columns(\"pickup_longitude\", \"pickup_latitude\")\n",
284-
"# .with_columns(geometry_column=<geometry_column_name>\") # Replace <geometry_column_name> with the actual name of your geometry column instead of latitude and longitude columns. \n",
285-
" .load()\n",
286-
")\n",
287-
"data['pickup_longitude'] = data['pickup_longitude'].astype(float)\n",
288-
"data['pickup_latitude'] = data['pickup_latitude'].astype(float)\n",
289-
"data.to_csv(\"./taxisvis1M.csv\")\n",
290-
"\n",
291243
"loader1 = (\n",
292244
" mapper\n",
293245
" .loader\n",
294-
" .from_file(\"pluto.csv\")\n",
246+
" .from_huggingface(\"oscur/pluto\", number_of_rows=1000, streaming=True)\n",
295247
" .with_columns(\"longitude\", \"latitude\")\n",
296248
"#     .with_columns(geometry_column=\"<geometry_column_name>\") # Replace <geometry_column_name> with the actual name of your geometry column instead of latitude and longitude columns. \n",
297249
" .build()\n",
298250
")\n",
299251
"loader2 = (\n",
300252
" mapper\n",
301253
" .loader\n",
302-
" .from_file(\"taxisvis1M.csv\")\n",
254+
" .from_huggingface(\"oscur/taxisvis1M\", number_of_rows=1000, streaming=True)\n",
303255
" .with_columns(\"pickup_longitude\", \"pickup_latitude\")\n",
304256
" .with_map({\"pickup_longitude\": \"longitude\", \"pickup_latitude\": \"latitude\"})\n",
305257
"#     .with_columns(geometry_column=\"<geometry_column_name>\") # Replace <geometry_column_name> with the actual name of your geometry column instead of latitude and longitude columns. \n",
@@ -322,7 +274,7 @@
322274
"enricher1 = mapper.enricher.with_data(group_by=\"nearest_intersection\", values_from=\"numfloors\", data_id=\"pluto_data\").aggregate_by(method=\"mean\", output_column=\"avg_floors\").build()\n",
323275
"enricher2 = mapper.enricher.with_data(group_by=\"pickup_segment\", data_id=\"taxi_data\").count_by(output_column=\"pickup_count\").build()\n",
324276
"\n",
325-
"visualiser = mapper.visual.with_type(\"Interactive\").with_style({\"tiles\": \"CartoDB dark_matter\"}).build()\n",
277+
"visualiser = mapper.visual.with_type(\"Interactive\").with_style({\"tiles\": \"CartoDB Positron\", \"colorbar_text_color\": \"gray\"}).build()\n",
326278
"\n",
327279
"# Assemble the pipeline\n",
328280
"# Note that a pipeline's step is a tuple with a name and the step itself.\n",

docs/copy_of_examples/2-End-to-End/2-pipeline_way.ipynb

Lines changed: 5 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -29,28 +29,6 @@
2929
"um = UrbanMapper()"
3030
]
3131
},
32-
{
33-
"cell_type": "code",
34-
"execution_count": null,
35-
"metadata": {},
36-
"outputs": [],
37-
"source": [
38-
"# Note: For the documentation interactive mode, we only query 5000 records from the dataset. Feel free to remove for a more realistic analysis.\n",
39-
"data = (\n",
40-
" UrbanMapper()\n",
41-
" .loader\n",
42-
" .from_huggingface(\"oscur/pluto\", number_of_rows=5000, streaming=True)\n",
43-
" .with_columns(longitude_column=\"longitude\", latitude_column=\"latitude\")\n",
44-
"# .with_columns(geometry_column=<geometry_column_name>\") # Replace <geometry_column_name> with the actual name of your geometry column instead of latitude and longitude columns.\n",
45-
" .load()\n",
46-
")\n",
47-
"\n",
48-
"data['longitude'] = data['longitude'].astype(float)\n",
49-
"data['latitude'] = data['latitude'].astype(float)\n",
50-
"\n",
51-
"data.to_csv(\"pluto.csv\")"
52-
]
53-
},
5432
{
5533
"cell_type": "markdown",
5634
"metadata": {},
@@ -90,9 +68,11 @@
9068
" ) # Recall that with mapping is to tell `map_nearest_layer` how it should map the urban data with the urban layer.\n",
9169
" .build()\n",
9270
")\n",
93-
"\n",
71+
"# Note: For the documentation interactive mode, we only query 5000 records from the dataset. Feel free to remove for a more realistic analysis.\n",
9472
"loader = (\n",
95-
" um.loader.from_file(\"./pluto.csv\")\n",
73+
" um\n",
74+
" .loader\n",
75+
" .from_huggingface(\"oscur/pluto\", number_of_rows=5000, streaming=True)\n",
9676
" .with_columns(longitude_column=\"longitude\", latitude_column=\"latitude\")\n",
9777
"#     .with_columns(geometry_column=\"<geometry_column_name>\") # Replace <geometry_column_name> with the actual name of your geometry column instead of latitude and longitude columns.\n",
9878
" .build()\n",
@@ -115,7 +95,7 @@
11595
"\n",
11696
"visualiser = (\n",
11797
" um.visual.with_type(\"Interactive\")\n",
118-
" .with_style({\"tiles\": \"CartoDB dark_matter\", \"colorbar_text_color\": \"white\"})\n",
98+
" .with_style({\"tiles\": \"CartoDB Positron\", \"colorbar_text_color\": \"gray\"})\n",
11999
" .build()\n",
120100
")\n",
121101
"\n",

docs/copy_of_examples/3-Case-Studies/1-Downtown-BK-Collisions/2-Downtown_BK_Collisions_Pipeline.ipynb

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,21 +20,7 @@
2020
"outputs": [],
2121
"source": [
2222
"import urban_mapper as um\n",
23-
"from urban_mapper.pipeline import UrbanPipeline\n",
24-
"\n",
25-
"# Note: For the documentation interactive mode, we only query 5000 records from the dataset. Feel free to remove for a more realistic analysis.\n",
26-
"data = (\n",
27-
" um.UrbanMapper()\n",
28-
" .loader\n",
29-
" .from_huggingface(\"oscur/NYC_vehicle_collisions\", number_of_rows=5000, streaming=True)\n",
30-
" .with_columns(longitude_column=\"LONGITUDE\", latitude_column=\"LATITUDE\")\n",
31-
" .load()\n",
32-
")\n",
33-
"\n",
34-
"data['LONGITUDE'] = data['LONGITUDE'].astype(float)\n",
35-
"data['LATITUDE'] = data['LATITUDE'].astype(float)\n",
36-
"\n",
37-
"data.to_csv(\"./NYC_Motor_Vehicle_Collisions_Mar_12_2025.csv\")"
23+
"from urban_mapper.pipeline import UrbanPipeline"
3824
]
3925
},
4026
{
@@ -59,10 +45,11 @@
5945
" )\n",
6046
" .build()\n",
6147
" )),\n",
48+
" # Note: For the documentation interactive mode, we only query 5000 records from the dataset. Feel free to remove for a more realistic analysis.\n",
6249
" (\"loader\", (\n",
6350
" mapper\n",
6451
" .loader\n",
65-
" .from_file(\"./NYC_Motor_Vehicle_Collisions_Mar_12_2025.csv\")\n",
52+
" .from_huggingface(\"oscur/NYC_vehicle_collisions\", number_of_rows=5000, streaming=True)\n",
6653
" .with_columns(longitude_column=\"LONGITUDE\", latitude_column=\"LATITUDE\")\n",
6754
" .build()\n",
6855
" )),\n",
@@ -85,7 +72,7 @@
8572
" mapper\n",
8673
" .visual\n",
8774
" .with_type(\"Interactive\")\n",
88-
" .with_style({\"tiles\": \"CartoDB dark_matter\", \"colorbar_text_color\": \"white\"})\n",
75+
" .with_style({\"tiles\": \"CartoDB Positron\", \"colorbar_text_color\": \"gray\"}) \n",
8976
" .build()\n",
9077
" ))\n",
9178
"])"

docs/copy_of_examples/3-Case-Studies/1-Downtown-BK-Collisions/3-Downtown_BK_Collisions_Advanced_Pipeline.ipynb

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -13,29 +13,6 @@
1313
"\n"
1414
]
1515
},
16-
{
17-
"cell_type": "code",
18-
"execution_count": null,
19-
"metadata": {},
20-
"outputs": [],
21-
"source": [
22-
"import urban_mapper as um\n",
23-
"from urban_mapper.pipeline import UrbanPipeline\n",
24-
"\n",
25-
"data = (\n",
26-
" um.UrbanMapper()\n",
27-
" .loader\n",
28-
" .from_huggingface(\"oscur/NYC_vehicle_collisions\")\n",
29-
" .with_columns(longitude_column=\"LONGITUDE\", latitude_column=\"LATITUDE\")\n",
30-
" .load()\n",
31-
")\n",
32-
"\n",
33-
"data['LONGITUDE'] = data['LONGITUDE'].astype(float)\n",
34-
"data['LATITUDE'] = data['LATITUDE'].astype(float)\n",
35-
"\n",
36-
"data.to_csv(\"./NYC_Motor_Vehicle_Collisions_Mar_12_2025.csv\")"
37-
]
38-
},
3916
{
4017
"cell_type": "code",
4118
"execution_count": null,
@@ -60,7 +37,7 @@
6037
" )),\n",
6138
" (\"loader\", (\n",
6239
" um.UrbanMapper().loader\n",
63-
" .from_file(\"./NYC_Motor_Vehicle_Collisions_Mar_12_2025.csv\")\n",
40+
" .from_huggingface(\"oscur/NYC_vehicle_collisions\")\n",
6441
" .with_columns(longitude_column=\"LONGITUDE\", latitude_column=\"LATITUDE\")\n",
6542
" .build()\n",
6643
" )),\n",
@@ -86,7 +63,7 @@
8663
" (\"visualiser\", (\n",
8764
" um.UrbanMapper().visual\n",
8865
" .with_type(\"Interactive\")\n",
89-
" .with_style({\"tiles\": \"CartoDB dark_matter\", \"colorbar_text_color\": \"white\"})\n",
66+
" .with_style({\"tiles\": \"CartoDB Positron\", \"colorbar_text_color\": \"gray\"})\n",
9067
" .build()\n",
9168
" ))\n",
9269
"])"

docs/copy_of_examples/3-Case-Studies/1-Downtown-BK-Collisions/4-Downtown_BK_Collisions_Advanced_Pipeline_Extras.ipynb

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -24,29 +24,6 @@
2424
"import pandas as pd"
2525
]
2626
},
27-
{
28-
"cell_type": "code",
29-
"execution_count": null,
30-
"metadata": {},
31-
"outputs": [],
32-
"source": [
33-
"import urban_mapper as um\n",
34-
"from urban_mapper.pipeline import UrbanPipeline\n",
35-
"\n",
36-
"data = (\n",
37-
" um.UrbanMapper()\n",
38-
" .loader\n",
39-
" .from_huggingface(\"oscur/NYC_vehicle_collisions\")\n",
40-
" .with_columns(longitude_column=\"LONGITUDE\", latitude_column=\"LATITUDE\")\n",
41-
" .load()\n",
42-
")\n",
43-
"\n",
44-
"data['LONGITUDE'] = data['LONGITUDE'].astype(float)\n",
45-
"data['LATITUDE'] = data['LATITUDE'].astype(float)\n",
46-
"\n",
47-
"data.to_csv(\"./NYC_Motor_Vehicle_Collisions_Mar_12_2025.csv\")"
48-
]
49-
},
5027
{
5128
"cell_type": "code",
5229
"execution_count": null,
@@ -156,7 +133,7 @@
156133
" )),\n",
157134
" (\"loader\", (\n",
158135
" um.UrbanMapper().loader\n",
159-
" .from_file(\"./NYC_Motor_Vehicle_Collisions_Mar_12_2025.csv\")\n",
136+
" .from_huggingface(\"oscur/NYC_vehicle_collisions\")\n",
160137
" .with_columns(longitude_column=\"LONGITUDE\", latitude_column=\"LATITUDE\")\n",
161138
" .build()\n",
162139
" )),\n",
@@ -236,7 +213,7 @@
236213
" (\"visualiser\", (\n",
237214
" um.UrbanMapper().visual\n",
238215
" .with_type(\"Interactive\")\n",
239-
" .with_style({\"tiles\": \"CartoDB dark_matter\", \"colorbar_text_color\": \"white\"})\n",
216+
" .with_style({\"tiles\": \"CartoDB Positron\", \"colorbar_text_color\": \"gray\"})\n",
240217
" .build()\n",
241218
" ))\n",
242219
"])"

0 commit comments

Comments
 (0)