Skip to content

Commit a139b1b

Browse files
aecorn and mmwinther authored
Adding naming validation to check whether the filename short description contains anything other than alphanumerics or dashes (#386)
* adding test to see if filename short description contains anything else than alnum or dash * whops wrong function name * not sure why test is failing more * doctest function names * pre-commit * triple negation shooting me in the foot * remove debug prints * wrong assumption about underscores in tests * Update src/dapla_metadata/standards/name_validator.py Co-authored-by: Miles Mason Winther <42948872+mmwinther@users.noreply.github.com> * Update src/dapla_metadata/standards/utils/constants.py Co-authored-by: Miles Mason Winther <42948872+mmwinther@users.noreply.github.com> * rename function everywhere --------- Co-authored-by: Miles Mason Winther <42948872+mmwinther@users.noreply.github.com>
1 parent 6d02d62 commit a139b1b

File tree

3 files changed

+53
-26
lines changed

3 files changed

+53
-26
lines changed

src/dapla_metadata/standards/name_validator.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from dapla_metadata.datasets.dapla_dataset_path_info import DaplaDatasetPathInfo
1010
from dapla_metadata.datasets.dataset_parser import SUPPORTED_DATASET_FILE_SUFFIXES
11+
from dapla_metadata.standards.utils.constants import DESCRIPTION_OTHER_THAN_DASHES
1112
from dapla_metadata.standards.utils.constants import FILE_DOES_NOT_EXIST
1213
from dapla_metadata.standards.utils.constants import FILE_IGNORED
1314
from dapla_metadata.standards.utils.constants import IGNORED_FOLDERS
@@ -163,6 +164,27 @@ def _has_invalid_symbols(path: ReadablePathLike) -> bool:
163164
return bool(re.search(r"[^a-zA-Z0-9\./:_\-=]", str(path).strip()))
164165

165166

167+
def _short_name_has_illegal_chars(dataset_short_name: str | None) -> bool:
168+
"""Return True if short name contains anything else than letters, digits or dashes (no underscores allowed).
169+
170+
Examples:
171+
>>> _short_name_has_illegal_chars("åregang-øre") # å and ø not allowed
172+
True
173+
174+
>>> _short_name_has_illegal_chars("Azor89")
175+
False
176+
177+
>>> _short_name_has_illegal_chars("skjema_2_p2018_p2020_v1") # Because 2 is considered part of shortname, and seperated by underscore.
178+
True
179+
180+
>>> _short_name_has_illegal_chars("data.parquet") # Returns True because . was sent in, should have been stripped into dataset_short_name.
181+
True
182+
"""
183+
if dataset_short_name is None or not dataset_short_name:
184+
return False
185+
return bool(re.search(r"[^a-zA-Z0-9\-]", str(dataset_short_name).strip()))
186+
187+
166188
def _check_violations(
167189
file: UPath,
168190
) -> list[str]:
@@ -174,6 +196,9 @@ def _check_violations(
174196
MISSING_PERIOD: path_info.contains_data_from,
175197
MISSING_DATASET_SHORT_NAME: path_info.dataset_short_name,
176198
INVALID_SYMBOLS: not _has_invalid_symbols(file),
199+
DESCRIPTION_OTHER_THAN_DASHES: not _short_name_has_illegal_chars(
200+
path_info.dataset_short_name
201+
),
177202
}
178203

179204
return [message for message, value in checks.items() if not value]

src/dapla_metadata/standards/utils/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
MISSING_DATASET_SHORT_NAME = "Filnavn mangler datasett kortnavn ref: https://manual.dapla.ssb.no/statistikkere/navnestandard.html#filnavn"
1717

1818
INVALID_SYMBOLS = "Filnavn inneholder ulovlige tegn ref: https://manual.dapla.ssb.no/statistikkere/navnestandard.html#filnavn"
19+
DESCRIPTION_OTHER_THAN_DASHES = "Datasett kortnavn inneholder ulovlige tegn ref: https://manual.dapla.ssb.no/statistikkere/navnestandard.html#filnavn"
1920

2021
PATH_IGNORED = "Ignorert, mappen er ikke underlagt krav til navnestandard."
2122
FILE_IGNORED = f"Ignorert, kun datasett med {', '.join(SUPPORTED_DATASET_FILE_SUFFIXES.keys())} filendelser valideres foreløpig."

tests/standards/test_check_naming_standard.py

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
@pytest.mark.parametrize(
2323
("file_path"),
2424
[
25-
"buckets/data/sirkus/utdata/person_data_p2021_v2.parquet",
25+
"buckets/data/sirkus/utdata/person-data_p2021_v2.parquet",
2626
],
2727
)
2828
@pytest.mark.asyncio
@@ -36,8 +36,8 @@ async def test_non_existent_path(file_path: str):
3636
@pytest.mark.parametrize(
3737
("file_path"),
3838
[
39-
"ssb-staging-dapla-felles-data-delt/datadoc/utdata/person_data_v1.parquet",
40-
"dataset/klargjorte_data/arbmark/resources/person_data_v1.parquet",
39+
"ssb-staging-dapla-felles-data-delt/datadoc/utdata/person-data_v1.parquet",
40+
"dataset/klargjorte_data/arbmark/resources/person-data_v1.parquet",
4141
],
4242
)
4343
@pytest.mark.asyncio
@@ -53,9 +53,9 @@ async def test_missing_date_period(file_path, tmp_path):
5353
@pytest.mark.parametrize(
5454
("file_path"),
5555
[
56-
"gs://ssb-staging-dapla-felles-data-delt/stat_reg/person_data_p2022_v1.parquet",
57-
"gs://ssb-staging-dapla-felles-data-delt/datadoc/person_data_p2021_v3.parquet",
58-
"buckets/produkt/test-2/person_testdata_p2021-12-31_p2021-12-31_v1.parquet",
56+
"gs://ssb-staging-dapla-felles-data-delt/stat_reg/person-data_p2022_v1.parquet",
57+
"gs://ssb-staging-dapla-felles-data-delt/datadoc/person-data_p2021_v3.parquet",
58+
"buckets/produkt/test-2/person-testdata_p2021-12-31_p2021-12-31_v1.parquet",
5959
],
6060
)
6161
@pytest.mark.asyncio
@@ -72,10 +72,10 @@ async def test_missing_data_state(file_path, tmp_path):
7272
@pytest.mark.parametrize(
7373
("file_path"),
7474
[
75-
"buckets/stat/inndata/person_data_p2022_v1.parquet",
76-
"gs://ssb-staging-dapla-felles-data-delt/inndata/person_data_p2022_v1.parquet",
77-
"gs://ssb-staging-dapla-felles-data-delt/klargjorte-data/person_data_p2021_v3.parquet",
78-
"buckets/produkt/utdata/person_testdata_p2021-12-31_p2021-12-31_v1.parquet",
75+
"buckets/stat/inndata/person-data_p2022_v1.parquet",
76+
"gs://ssb-staging-dapla-felles-data-delt/inndata/person-data_p2022_v1.parquet",
77+
"gs://ssb-staging-dapla-felles-data-delt/klargjorte-data/person-data_p2021_v3.parquet",
78+
"buckets/produkt/utdata/person-testdata_p2021-12-31_p2021-12-31_v1.parquet",
7979
],
8080
)
8181
@pytest.mark.asyncio
@@ -111,8 +111,8 @@ async def test_ignored_paths(file_path, tmp_path):
111111
@pytest.mark.parametrize(
112112
("file_path"),
113113
[
114-
"buckets/ssb-dapla-example-data-produkt-prod/ledstill/utdata/persån_testdata_p2021-12-31_p2021-12-31_v1.parquet",
115-
"gs://ssb-dapla-example-data-prædukt-prod/ledstill/utdata/person_testdata_p2021-12-31_p2021-12-31_v1.parquet",
114+
"buckets/ssb-dapla-example-data-produkt-prod/ledstill/utdata/persån-testdata_p2021-12-31_p2021-12-31_v1.parquet",
115+
"gs://ssb-dapla-example-data-prædukt-prod/ledstill/utdata/person-testdata_p2021-12-31_p2021-12-31_v1.parquet",
116116
],
117117
)
118118
@pytest.mark.asyncio
@@ -190,13 +190,13 @@ async def test_missing_multiple(file_path: str, violations: list, tmp_path):
190190
@pytest.mark.parametrize(
191191
("file_path"),
192192
[
193-
"buckets/produkt/datadoc/utdata/person_data_p2021_v2.parquet",
194-
"produkt/datadoc/utdata/person_data_p2021_p2022_v2.parquet",
195-
"datadoc/utdata/undermappe/person_data_p2021_v2.parquet",
196-
"delt-data/dataset/klargjorte_data/arbmark/resources/person_data_p2021-12-31_p2021-12-31_v1.parquet",
197-
"stat/inndata/person_testdata_p2021-12-31_p2021-12-31_v1.parquet",
198-
"ssb-delt/stat/klargjorte-data/person_testdata_p2021-12-31_p2021-12-31_v1.parquet",
199-
"produkt-delt/datadoc/brukertest/1/sykefratot/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1.parquet",
193+
"buckets/produkt/datadoc/utdata/person-data_p2021_v2.parquet",
194+
"produkt/datadoc/utdata/person-data_p2021_p2022_v2.parquet",
195+
"datadoc/utdata/undermappe/person-data_p2021_v2.parquet",
196+
"delt-data/dataset/klargjorte_data/arbmark/resources/person-data_p2021-12-31_p2021-12-31_v1.parquet",
197+
"stat/inndata/person-testdata_p2021-12-31_p2021-12-31_v1.parquet",
198+
"ssb-delt/stat/klargjorte-data/person-testdata_p2021-12-31_p2021-12-31_v1.parquet",
199+
"produkt-delt/datadoc/brukertest/1/sykefratot/klargjorte_data/person-testdata_p2021-12-31_p2021-12-31_v1.parquet",
200200
],
201201
)
202202
@pytest.mark.asyncio
@@ -219,8 +219,8 @@ async def test_check_naming_standard_specific_file_path(
219219
@pytest.mark.parametrize(
220220
("file_path"),
221221
[
222-
"buckets/produkt/datadoc/utdata/person_data_p2021_v2.csv",
223-
"samfunns-produkt/datadoc/brukertest/1/sykefratot/klargjorte_data/person_testdata_p2021-12-31_p2021-12-31_v1.json",
222+
"buckets/produkt/datadoc/utdata/person-data_p2021_v2.csv",
223+
"samfunns-produkt/datadoc/brukertest/1/sykefratot/klargjorte_data/person-testdata_p2021-12-31_p2021-12-31_v1.json",
224224
],
225225
)
226226
@pytest.mark.asyncio
@@ -250,8 +250,8 @@ async def test_check_naming_standard_ignored_file_type(
250250
(
251251
"stat_reg",
252252
[
253-
("person_data_p2022_v1.parquet", NAME_STANDARD_SUCCESS),
254-
("bil_data_p2022_v1.parquet", NAME_STANDARD_SUCCESS),
253+
("person-data_p2022_v1.parquet", NAME_STANDARD_SUCCESS),
254+
("bil-data_p2022_v1.parquet", NAME_STANDARD_SUCCESS),
255255
("my%stuff.csv", FILE_IGNORED),
256256
],
257257
"ssb-staging-dapla-felles-data-delt",
@@ -260,7 +260,7 @@ async def test_check_naming_standard_ignored_file_type(
260260
"temp_stuff",
261261
[
262262
("_p2022_v1.parquet", MISSING_DATASET_SHORT_NAME),
263-
("bil_data_v1.parquet", MISSING_PERIOD),
263+
("bil-data_v1.parquet", MISSING_PERIOD),
264264
("my%stuff.csv", FILE_IGNORED),
265265
],
266266
"ssb-staging-dapla-felles-data-delt",
@@ -350,6 +350,7 @@ async def test_generate_naming_standard_report(tmp_path):
350350
"buckets/ssb-dapla-example-data-produkt-prod/utdata/editert_v1.parquet",
351351
"buckets/ssb-dapla-example-data-produkt-prod/klargjorte_data/_p2021-12-31_p2021-12-31_v1.parquet",
352352
"buckets/ssb-dapla-example-data-produkt-prod/ledstill/klargjorte_data/park_p2021-12-31_p2021-12-31_v1.parquet",
353+
"buckets/ssb-dapla-example-data-produkt-prod/ledstill/park_wrongunderscore_v1.parquet",
353354
]
354355
for file_path in file_paths:
355356
full_path = tmp_path / file_path
@@ -362,8 +363,8 @@ async def test_generate_naming_standard_report(tmp_path):
362363

363364
if isinstance(results, list):
364365
report = generate_validation_report(validation_results=results)
365-
assert report.num_failures == 3
366-
assert report.num_files_validated == 5
366+
assert report.num_failures == 4
367+
assert report.num_files_validated == 6
367368
assert report.num_success == 2
368369

369370

0 commit comments

Comments
 (0)