|
5 | 5 | <head> |
6 | 6 | <meta charset="utf-8" /> |
7 | 7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" /> |
8 | | - <title>pythainlp.corpus.core — PyThaiNLP 3ce57f7 documentation</title> |
| 8 | + <title>pythainlp.corpus.core — PyThaiNLP f7df55d documentation</title> |
9 | 9 | <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=03e43079" /> |
10 | 10 | <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" /> |
11 | 11 | <link rel="stylesheet" type="text/css" href="../../../_static/style.css?v=eea1f72d" /> |
12 | 12 |
|
13 | 13 |
|
14 | 14 | <script src="../../../_static/jquery.js?v=5d32c60e"></script> |
15 | 15 | <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script> |
16 | | - <script src="../../../_static/documentation_options.js?v=2a7bd371"></script> |
| 16 | + <script src="../../../_static/documentation_options.js?v=e5bf50f9"></script> |
17 | 17 | <script src="../../../_static/doctools.js?v=9bcbadda"></script> |
18 | 18 | <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script> |
19 | 19 | <script src="../../../_static/js/theme.js"></script> |
@@ -116,6 +116,7 @@ <h1>Source code for pythainlp.corpus.core</h1><div class="highlight"><pre> |
116 | 116 |
|
117 | 117 | <span class="kn">import</span><span class="w"> </span><span class="nn">json</span> |
118 | 118 | <span class="kn">import</span><span class="w"> </span><span class="nn">os</span> |
| 119 | +<span class="kn">import</span><span class="w"> </span><span class="nn">re</span> |
119 | 120 | <span class="kn">from</span><span class="w"> </span><span class="nn">typing</span><span class="w"> </span><span class="kn">import</span> <span class="n">Union</span> |
120 | 121 |
|
121 | 122 | <span class="kn">from</span><span class="w"> </span><span class="nn">pythainlp</span><span class="w"> </span><span class="kn">import</span> <span class="n">__version__</span> |
@@ -715,9 +716,6 @@ <h1>Source code for pythainlp.corpus.core</h1><div class="highlight"><pre> |
715 | 716 | <span class="sd"> # FileNotFoundError: [Errno 2] No such file or directory:</span> |
716 | 717 | <span class="sd"> # '/usr/local/lib/python3.6/dist-packages/pythainlp/corpus/ttc'</span> |
717 | 718 | <span class="sd"> """</span> |
718 | | - <span class="k">if</span> <span class="n">_CHECK_MODE</span> <span class="o">==</span> <span class="s2">"1"</span><span class="p">:</span> |
719 | | - <span class="nb">print</span><span class="p">(</span><span class="s2">"PyThaiNLP is read-only mode. It can't remove corpus."</span><span class="p">)</span> |
720 | | - <span class="k">return</span> <span class="kc">False</span> |
721 | 719 | <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">corpus_db_path</span><span class="p">(),</span> <span class="s2">"r"</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s2">"utf-8-sig"</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span> |
722 | 720 | <span class="n">db</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> |
723 | 721 | <span class="n">data</span> <span class="o">=</span> <span class="p">[</span> |
@@ -746,6 +744,63 @@ <h1>Source code for pythainlp.corpus.core</h1><div class="highlight"><pre> |
746 | 744 |
|
747 | 745 | <span class="k">def</span><span class="w"> </span><span class="nf">get_path_folder_corpus</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">version</span><span class="p">,</span> <span class="o">*</span><span class="n">path</span><span class="p">):</span> |
748 | 746 | <span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">get_corpus_path</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">version</span><span class="p">),</span> <span class="o">*</span><span class="n">path</span><span class="p">)</span> |
| 747 | + |
| 748 | + |
| 749 | +<span class="k">def</span><span class="w"> </span><span class="nf">make_safe_directory_name</span><span class="p">(</span><span class="n">name</span><span class="p">:</span><span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| 750 | +<span class="w"> </span><span class="sd">"""</span> |
| 751 | +<span class="sd"> Make safe directory name</span> |
| 752 | + |
| 753 | +<span class="sd"> :param str name: directory name</span> |
| 754 | +<span class="sd"> :return: safe directory name</span> |
| 755 | +<span class="sd"> :rtype: str</span> |
| 756 | +<span class="sd"> """</span> |
| 757 | + <span class="c1"># Replace invalid characters with an underscore</span> |
| 758 | + <span class="n">safe_name</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="sa">r</span><span class="s1">'[<>:"/</span><span class="se">\\</span><span class="s1">|?*]'</span><span class="p">,</span> <span class="s1">'_'</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span> |
| 759 | + <span class="c1"># Remove leading/trailing spaces or periods (especially important for Windows)</span> |
| 760 | + <span class="n">safe_name</span> <span class="o">=</span> <span class="n">safe_name</span><span class="o">.</span><span class="n">strip</span><span class="p">(</span><span class="s1">' .'</span><span class="p">)</span> |
| 761 | + <span class="c1"># Prevent names that are reserved on Windows</span> |
| 762 | + <span class="n">reserved_names</span> <span class="o">=</span> <span class="p">[</span><span class="s1">'CON'</span><span class="p">,</span> <span class="s1">'PRN'</span><span class="p">,</span> <span class="s1">'AUX'</span><span class="p">,</span> <span class="s1">'NUL'</span><span class="p">,</span> <span class="s1">'COM1'</span><span class="p">,</span> <span class="s1">'COM2'</span><span class="p">,</span> <span class="s1">'COM3'</span><span class="p">,</span> <span class="s1">'COM4'</span><span class="p">,</span> <span class="s1">'COM5'</span><span class="p">,</span> <span class="s1">'COM6'</span><span class="p">,</span> <span class="s1">'COM7'</span><span class="p">,</span> <span class="s1">'COM8'</span><span class="p">,</span> <span class="s1">'COM9'</span><span class="p">,</span> <span class="s1">'LPT1'</span><span class="p">,</span> <span class="s1">'LPT2'</span><span class="p">,</span> <span class="s1">'LPT3'</span><span class="p">,</span> <span class="s1">'LPT4'</span><span class="p">,</span> <span class="s1">'LPT5'</span><span class="p">,</span> <span class="s1">'LPT6'</span><span class="p">,</span> <span class="s1">'LPT7'</span><span class="p">,</span> <span class="s1">'LPT8'</span><span class="p">,</span> <span class="s1">'LPT9'</span><span class="p">]</span> |
| 763 | + <span class="k">if</span> <span class="n">safe_name</span><span class="o">.</span><span class="n">upper</span><span class="p">()</span> <span class="ow">in</span> <span class="n">reserved_names</span><span class="p">:</span> |
| 764 | + <span class="n">safe_name</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"_</span><span class="si">{</span><span class="n">safe_name</span><span class="si">}</span><span class="s2">"</span> <span class="c1"># Prepend underscore to avoid conflict</span> |
| 765 | + <span class="k">return</span> <span class="n">safe_name</span> |
| 766 | + |
| 767 | + |
| 768 | +<span class="k">def</span><span class="w"> </span><span class="nf">get_hf_hub</span><span class="p">(</span><span class="n">repo_id</span><span class="p">:</span><span class="nb">str</span><span class="p">,</span> <span class="n">filename</span><span class="p">:</span> <span class="nb">str</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| 769 | +<span class="w"> </span><span class="sd">"""</span> |
| 770 | +<span class="sd"> HuggingFace Hub in :mod:`pythainlp` data directory.</span> |
| 771 | + |
| 772 | +<span class="sd"> :param str repo_id: repo_id</span> |
| 773 | +<span class="sd"> :param str filename: filename</span> |
| 774 | +<span class="sd"> :return: path</span> |
| 775 | +<span class="sd"> :rtype: str</span> |
| 776 | +<span class="sd"> """</span> |
| 777 | + <span class="k">if</span> <span class="n">_CHECK_MODE</span> <span class="o">==</span> <span class="s2">"1"</span><span class="p">:</span> |
| 778 | + <span class="nb">print</span><span class="p">(</span><span class="s2">"PyThaiNLP is read-only mode. It can't download."</span><span class="p">)</span> |
| 779 | + <span class="k">return</span> <span class="kc">False</span> |
| 780 | + <span class="k">try</span><span class="p">:</span> |
| 781 | + <span class="kn">from</span><span class="w"> </span><span class="nn">huggingface_hub</span><span class="w"> </span><span class="kn">import</span> <span class="n">hf_hub_download</span><span class="p">,</span> <span class="n">snapshot_download</span> |
| 782 | + <span class="k">except</span> <span class="ne">ModuleNotFoundError</span><span class="p">:</span> |
| 783 | + <span class="k">raise</span> <span class="ne">ModuleNotFoundError</span><span class="p">(</span><span class="s2">"""</span> |
| 784 | +<span class="s2"> huggingface-hub isn't found!</span> |
| 785 | +<span class="s2"> Please installing the package via 'pip install huggingface-hub'.</span> |
| 786 | +<span class="s2"> """</span><span class="p">)</span> |
| 787 | + <span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span> |
| 788 | + <span class="k">raise</span> <span class="ne">Exception</span><span class="p">(</span><span class="sa">f</span><span class="s2">"An unexpected error occurred: </span><span class="si">{</span><span class="n">e</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> |
| 789 | + <span class="n">hf_root</span> <span class="o">=</span> <span class="n">get_full_data_path</span><span class="p">(</span><span class="s2">"hf_models"</span><span class="p">)</span> |
| 790 | + <span class="n">name_dir</span> <span class="o">=</span> <span class="n">make_safe_directory_name</span><span class="p">(</span><span class="n">repo_id</span><span class="p">)</span> |
| 791 | + <span class="n">root_project</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">hf_root</span><span class="p">,</span> <span class="n">name_dir</span><span class="p">)</span> |
| 792 | + <span class="k">if</span> <span class="n">filename</span><span class="o">!=</span><span class="kc">None</span><span class="p">:</span> |
| 793 | + <span class="n">output_path</span> <span class="o">=</span> <span class="n">hf_hub_download</span><span class="p">(</span> |
| 794 | + <span class="n">repo_id</span><span class="o">=</span><span class="n">repo_id</span><span class="p">,</span> |
| 795 | + <span class="n">filename</span><span class="o">=</span><span class="n">filename</span><span class="p">,</span> |
| 796 | + <span class="n">local_dir</span><span class="o">=</span><span class="n">root_project</span> |
| 797 | + <span class="p">)</span> |
| 798 | + <span class="k">else</span><span class="p">:</span> |
| 799 | + <span class="n">output_path</span> <span class="o">=</span> <span class="n">snapshot_download</span><span class="p">(</span> |
| 800 | + <span class="n">repo_id</span><span class="o">=</span><span class="n">repo_id</span><span class="p">,</span> |
| 801 | + <span class="n">local_dir</span><span class="o">=</span><span class="n">root_project</span> |
| 802 | + <span class="p">)</span> |
| 803 | + <span class="k">return</span> <span class="n">output_path</span> |
749 | 804 | </pre></div> |
750 | 805 |
|
751 | 806 | </div> |
|
0 commit comments