Skip to content

Commit 41ff37b

Browse files
committed
Add get_hf_hub and make_safe_directory_name
1 parent 3ce57f7 commit 41ff37b

File tree

2 files changed

+54
-0
lines changed

2 files changed

+54
-0
lines changed

pythainlp/corpus/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
"get_corpus_default_db",
2424
"get_corpus_path",
2525
"get_path_folder_corpus",
26+
"get_hf_hub",
2627
"path_pythainlp_corpus",
2728
"provinces",
2829
"remove",
@@ -41,6 +42,7 @@
4142
"thai_wikipedia_titles",
4243
"thai_words",
4344
"thai_wsd_dict",
45+
"make_safe_directory_name",
4446
]
4547

4648
import os
@@ -98,6 +100,8 @@ def corpus_db_path() -> str:
98100
get_corpus_default_db,
99101
get_corpus_path,
100102
get_path_folder_corpus,
103+
make_safe_directory_name,
104+
get_hf_hub,
101105
path_pythainlp_corpus,
102106
remove,
103107
) # these imports must come before other pythainlp.corpus.* imports

pythainlp/corpus/core.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import json
1010
import os
11+
import re
1112
from typing import Union
1213

1314
from pythainlp import __version__
@@ -614,3 +615,52 @@ def remove(name: str) -> bool:
614615

615616
def get_path_folder_corpus(name, version, *path):
    """
    Build a path inside the folder of a corpus.

    The base location is whatever ``get_corpus_path(name, version)``
    returns; the extra ``path`` components are appended to it with
    :func:`os.path.join`.

    :param name: corpus name, passed through to ``get_corpus_path``
    :param version: corpus version, passed through to ``get_corpus_path``
    :param path: additional path components to append, in order
    :return: the joined path
    """
    corpus_root = get_corpus_path(name, version)
    return os.path.join(corpus_root, *path)
618+
619+
620+
def make_safe_directory_name(name: str) -> str:
    """
    Make safe directory name

    Turn an arbitrary string (for example a repository id such as
    ``org/model``) into a string that can be used as a single directory
    name, including on Windows.

    The following steps are applied, in order:

    1. characters that are invalid in Windows file names
       (``< > : " / \\ | ? *`` and ASCII control characters) are replaced
       with an underscore;
    2. leading and trailing spaces and periods are removed;
    3. if the part before the first period is a reserved Windows device
       name (``CON``, ``NUL``, ``COM1``, ...; compared case-insensitively),
       an underscore is prepended;
    4. if nothing is left, ``"_"`` is returned, so the result is never an
       empty string.

    :param str name: directory name
    :return: safe directory name
    :rtype: str

    :Example:
    ::

        make_safe_directory_name("org/model")
        # output: 'org_model'

        make_safe_directory_name("NUL.txt")
        # output: '_NUL.txt'
    """
    # Replace invalid characters (including ASCII control characters,
    # which Windows also rejects) with an underscore
    safe_name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", name)
    # Remove leading/trailing spaces or periods (especially important for
    # Windows, and it also neutralises "." and "..")
    safe_name = safe_name.strip(" .")
    # Names that are reserved on Windows; a set gives O(1) membership tests
    reserved_names = {
        "CON", "PRN", "AUX", "NUL",
        "COM1", "COM2", "COM3", "COM4", "COM5",
        "COM6", "COM7", "COM8", "COM9",
        "LPT1", "LPT2", "LPT3", "LPT4", "LPT5",
        "LPT6", "LPT7", "LPT8", "LPT9",
    }
    # A reserved name followed by an extension (e.g. "NUL.txt") is still
    # reserved, so only the part before the first period is compared
    stem = safe_name.split(".", 1)[0]
    if stem.upper() in reserved_names:
        safe_name = f"_{safe_name}"  # Prepend underscore to avoid conflict
    # An empty name would make os.path.join(root, name) point at root itself
    if not safe_name:
        safe_name = "_"
    return safe_name
637+
638+
639+
def get_hf_hub(
    repo_id: str, filename: Union[str, None] = None
) -> Union[str, bool]:
    """
    HuggingFace Hub in pythainlp-home

    Download from the Hugging Face Hub into a per-repository directory
    under the ``hf_models`` data path (as resolved by
    ``get_full_data_path``). The directory name is derived from
    ``repo_id`` with ``make_safe_directory_name``.

    If ``filename`` is given, only that file is fetched with
    ``huggingface_hub.hf_hub_download``; otherwise the whole repository is
    fetched with ``huggingface_hub.snapshot_download``.

    :param str repo_id: repo_id
    :param str filename: filename inside the repository; ``None`` (the
        default) downloads a snapshot of the whole repository
    :return: path returned by the huggingface_hub download function, or
        ``False`` if PyThaiNLP is in read-only mode
    :rtype: Union[str, bool]
    :raises ModuleNotFoundError: if the ``huggingface_hub`` package is not
        installed
    """
    if _CHECK_MODE == "1":
        print("PyThaiNLP is read-only mode. It can't download.")
        return False

    # huggingface_hub is imported lazily so that it is only needed by
    # callers that actually use this function
    try:
        from huggingface_hub import hf_hub_download, snapshot_download
    except ModuleNotFoundError as err:
        raise ModuleNotFoundError(
            "huggingface_hub is required by get_hf_hub(); "
            "install it with `pip install huggingface_hub`."
        ) from err

    hf_root = get_full_data_path("hf_models")
    name_dir = make_safe_directory_name(repo_id)
    root_project = os.path.join(hf_root, name_dir)

    if filename is not None:
        # Single-file download
        return hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=root_project,
        )

    # No file requested: download the whole repository
    return snapshot_download(
        repo_id=repo_id,
        local_dir=root_project,
    )

0 commit comments

Comments
 (0)