diff --git a/README.rst b/README.rst index cc79414..b96828e 100644 --- a/README.rst +++ b/README.rst @@ -105,3 +105,24 @@ Usage ===== Check `usage page `_ + + +Docker +====== +A Docker image for the frontend is available at https://hub.docker.com/r/linhdpostdata/averell-ui + +This is a web frontend for Averell, the corpus tool developed by the POSTDATA group. To install and run it, follow these steps: + +* Download Docker Desktop from: https://www.docker.com/products/docker-desktop/ + +* Open the Docker Desktop app. + +* From the Docker Dashboard you can use Quick Search, which is located in the Dashboard header, to search for: + * Any container or Compose app on your local system. You can see an overview of associated environment variables or perform quick actions, such as start, stop, or delete. + * Public Docker Hub images, local images, and images from remote repositories. Depending on the type of image you select, you can either pull the image by tag, view documentation, go to Docker Hub for more details, or run a new container using the image. + * Extensions. From here, you can learn more about the extension and install it with a single click. Or, if you already have an extension installed, you can open it straight from the search results. + * Any volume. From here you can view the associated container. + +* Search for ``linhdpostdata/averell-ui`` and pull the image. + +* Run the container with port 5741 published, then open http://127.0.0.1:5741 in your browser to access the UI. 
import copy
import json
from pathlib import Path
from zipfile import ZipFile

# NOTE(review): besides the standard-library imports above, the callbacks in
# this section expect these names at module level: ``gr`` (gradio),
# ``get_ids`` / ``generate_tei`` (averell.utils), ``export_corpora``
# (averell.core) and the ``out_file`` / ``rad_granularity`` components that
# the UI layout creates.

# Mutable UI state shared by every callback below.
PARAMS = {
    "granularity": "poem",
    "output_format": "JSON",
    "corpora_list": [
        'bibit', 'stichopt', 'disco2_1',
        'disco3', 'adso', 'adso100',
        'plc', 'gongo', 'ecpa',
        '4b4v', 'czverse', 'mel'
    ],
}


def get_available_languages(sources):
    """
    Collect the distinct language codes of the given corpora.

    :param sources: Iterable of corpus dicts exposing
        ``["properties"]["language"]``
    :return: List of language codes, in order of first appearance
    """
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(c["properties"]["language"] for c in sources))


def filter_corpus_language(sources, lg):
    """
    Select the corpora written in one language.

    :param sources: Iterable of corpus dicts exposing
        ``["properties"]["language"]``
    :param lg: Language code to keep
    :return: List with independent deep copies of the matching corpus dicts,
        so callers may mutate them without touching ``sources``
    """
    # Copy only the entries that are kept instead of the whole catalogue.
    return [
        copy.deepcopy(c) for c in sources
        if c["properties"]["language"] == lg
    ]


def export(output_path, output_format, output_granularity):
    """
    Build the downloadable file for the corpora currently selected in the UI.

    :param output_path: Value of the hidden label component; a dict whose
        ``"label"`` entry is the working folder
    :param output_format: ``"TEI"`` or ``"JSON"``
    :param output_granularity: Granularity chosen in the UI; ``"poem"``
        zips the per-poem JSON files, anything else produces a single JSON
    :return: Dict mapping ``out_file`` to the file-component update to apply
    """
    corpora_list = PARAMS["corpora_list"]
    if not corpora_list:
        return {out_file: gr.File.update(label="ERROR: No corpus selected")}
    output_path = output_path["label"]
    # Every artefact below is written under tmp/; neither ZipFile nor open()
    # creates a missing parent folder.
    Path("tmp").mkdir(parents=True, exist_ok=True)
    archive = f"tmp/{output_format}.zip"
    # Export to TEI
    if output_format == "TEI":
        generate_tei(corpora_list, output_path, True)
        return {out_file: gr.File.update(value=archive, label=output_format)}
    # Export to JSON
    if output_granularity == "poem":
        export_corpora(get_ids(corpora_list),
                       None,
                       "tmp/JSON",
                       None,
                       no_download=False,
                       ui_mode=True)
        with ZipFile(archive, "w") as zfile:
            for corp in corpora_list:
                corpus_dir = Path(f'{output_path}/{output_format}/{corp}')
                for json_path in corpus_dir.glob("**/*.json"):
                    zfile.write(json_path)
        return {out_file: gr.File.update(value=archive, label=output_format)}
    json_l, filename = export_corpora(get_ids(corpora_list),
                                      output_granularity,
                                      "tmp/JSON",
                                      None,
                                      no_download=False,
                                      ui_mode=True)
    json_file = f"tmp/{filename}.json"
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(json_l, f, ensure_ascii=False, indent=4)
    return {out_file: gr.File.update(value=json_file, label=filename)}


def block_granularity(r_format):
    """
    React to a change of the output format.

    Stores the format in ``PARAMS`` and resets the granularity selector to
    ``"poem"``; the selector is hidden for TEI and shown otherwise.

    :param r_format: Newly selected output format
    :return: Dict mapping ``rad_granularity`` to its update
    """
    PARAMS["output_format"] = r_format
    return {rad_granularity: gr.update(value="poem",
                                       visible=r_format != "TEI")}


def update_granularity(value):
    """
    React to a change of the granularity selector.

    Returns one update per output component, in the order
    corpus checkboxes (12), language checkboxes (6), language boxes (6).

    :param value: Newly selected granularity
    :return: List of component updates
    """
    PARAMS["granularity"] = value
    true_l = [gr.Checkbox.update(value=True, visible=True)]
    false_l = [gr.Checkbox.update(value=False, visible=False)]
    true_acc = [gr.Box.update(visible=True)]
    false_acc = [gr.Box.update(visible=False)]
    if value == "word":
        # 000011100110 + 110110 + 110110
        corp_list = false_l*4 + true_l*3 + false_l*2 + true_l*2 + false_l
        lang_list = true_l*2 + false_l + true_l*2 + false_l
        acc_list = true_acc*2 + false_acc + true_acc*2 + false_acc
        PARAMS["corpora_list"] = ["plc", "gongo", "ecpa", "bibit", "czverse"]
        return corp_list + lang_list + acc_list
    if value == "syllable":
        # 000011000000 + 100000 + 100000
        corp_list = false_l*4 + true_l*2 + false_l*6
        lang_list = true_l + false_l*5
        acc_list = true_acc + false_acc*5
        PARAMS["corpora_list"] = ["plc", "gongo"]
        return corp_list + lang_list + acc_list
    # NOTE(review): PARAMS["corpora_list"] is not reset here; presumably the
    # checkbox change events triggered by these updates repopulate it --
    # confirm against the gradio version in use.
    return true_l*18 + true_acc*6


def change_selection(value, *labels):
    """
    Apply a language-level "select all/none" toggle.

    :param value: New checkbox state
    :param labels: Slugs of the corpora to add or remove
    :return: ``value`` unchanged, to be written to the output checkbox
    """
    for corpus_name in labels:
        update_corpora_list(value, corpus_name)
    return value


def change_global_selection(value, *labels):
    """
    Apply the global "select all/none" toggle.

    :param value: New checkbox state
    :param labels: Slugs of the corpora to add or remove
    :return: Six identical checkbox updates carrying ``value``
        (one per language checkbox wired as output)
    """
    for corpus_name in labels:
        update_corpora_list(value, corpus_name)
    return [gr.Checkbox.update(value=value)] * 6


def update_corpora_list(added, corpus_name):
    """
    Add or remove one corpus slug from ``PARAMS["corpora_list"]`` in place.

    :param added: Truthy to select the corpus, falsy to deselect it
    :param corpus_name: Corpus slug
    """
    corpora_list = PARAMS["corpora_list"]
    if corpus_name in corpora_list and not added:
        corpora_list.remove(corpus_name)
    elif corpus_name not in corpora_list and added:
        corpora_list.append(corpus_name)

Averell

") + with gr.Row() as row: + with gr.Column(scale=3) as c1: + rad_format = gr.Radio(["TEI", "JSON"], + label="Output", + info="Choose output format", + value="TEI", + interactive=True) + rad_granularity = gr.Radio( + ["poem", "stanza", "line", "word", "syllable"], + label="Granularity", + info="Choose output granularity", + value="poem", + interactive=True, + visible=False, + ) + corpus_checkboxes = [] + lang_checkboxes = [] + with gr.Box() as b1: + gr.HTML(value="

Corpora list

") + gr.HTML(value="
") + all_corp_chk = gr.Checkbox(True, label="Select all/none", + interactive=True) + with gr.Row() as rowa: + all_label_list = [] + gr.Checkbox(label="dummy",visible=False) + for lang in available_languages: + with gr.Column() as b2: + language = pycountry.languages.get( + alpha_2=lang).name + gr.HTML(language) + with gr.Blocks() as corpora: + with gr.Row() as rowb: + lang_chk = gr.Checkbox(True, + label="Select all/none", + interactive=True) + filtered_corpus = filter_corpus_language( + CORPORA_SOURCES, lang) + with gr.Accordion("Expand list", + open=False) as acc: + label_list = [] + for corpus in filtered_corpus: + classes = corpus["properties"][ + "granularity"] + classes.append("poem") + classes.append(lang) + chk = gr.Checkbox(True, + label=corpus[ + "name"], + info=f'License: {corpus["properties"]["license"]} | Number of poems: {corpus["properties"]["doc_quantity"]}', + interactive=True, + elem_classes=classes, + elem_id=corpus[ + "properties"][ + "slug"], + ) + label = gr.Textbox( + value=corpus["properties"][ + "slug"], visible=False) + # Corpus checkboxes change + chk.change(update_corpora_list, + [chk, label], + show_progress=False) + # "Select all" language checkboxes change + lang_chk.change(change_selection, + [lang_chk, + *label_list], + chk, + show_progress=False) + corpus_checkboxes.append(chk) + label_list.append(label) + all_label_list.append(label) + lang_checkboxes.append(lang_chk) + # "Select All/None" checkbox change + all_corp_chk.change(change_global_selection, + [all_corp_chk, + *all_label_list], + [*lang_checkboxes]) + with gr.Column(scale=1) as c2: + accordions_boxes = rowa.children[1:] + rad_granularity.change(update_granularity, + rad_granularity, + [*corpus_checkboxes, + *lang_checkboxes, + *accordions_boxes], + show_progress=False) + rad_format.change(block_granularity, + rad_format, + rad_granularity, + api_name="output_format", + show_progress=False) + exp_btn = gr.Button("Export") + folder_path = gr.Label(value="tmp/", 
visible=False) + out_file = gr.File() + exp_btn.click(export, + [folder_path, + rad_format, + rad_granularity], + out_file, + api_name="export") + +app_averell.launch(share=False, server_port=5741) diff --git a/requirements.txt b/requirements.txt index ffc9977..c4a2bfa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,8 @@ pyyaml>=5 requests>=2.21.0 tabulate>=0.8.7 tqdm>=4.43.0 + +averell~=1.2.2 +setuptools~=67.7.2 +gradio~=3.27.0 +pycountry~=22.3.5 diff --git a/setup.py b/setup.py index 6faa4e0..095e1de 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ def read(*names, **kwargs): keywords=[ # eg: 'keyword1', 'keyword2', 'keyword3', ], - python_requires='>3.6.*', + python_requires='>=3.6', install_requires=read("requirements.txt").split("\n"), extras_require={ # eg: diff --git a/src/averell/core.py b/src/averell/core.py index 423b4ba..7d27004 100644 --- a/src/averell/core.py +++ b/src/averell/core.py @@ -49,7 +49,8 @@ def get_corpora(corpus_indices=None, output_folder=DEFAULT_OUTPUT_FOLDER): def export_corpora( - corpus_ids, granularity, corpora_folder, filename, no_download=False + corpus_ids, granularity, corpora_folder, filename, no_download=False, + ui_mode=False ): """ Generates a single JSON file with the chosen granularity for all of the @@ -60,6 +61,8 @@ def export_corpora( :param corpora_folder: Local folder where the corpora is located :param filename: Name of the output file :param no_download: Whether to download or not a corpora when missing + :param ui_mode: Whether the function is called from the gradio UI + :return: Python dict with the chosen granularity for all of the selected corpora """ @@ -68,6 +71,8 @@ def export_corpora( export_filename = filename if Path(corpora_folder).exists() or not no_download: if not corpus_ids: + print("ID not in corpora list") + logging.error("No CORPUS ID selected") else: if granularity is not None: @@ -75,6 +80,7 @@ def export_corpora( try: corpus = CORPORA_SOURCES[corpus_id] except IndexError: + 
import xml.etree.ElementTree as et

# Human-readable corpus titles keyed by corpus slug; used for the TEI
# <seriesStmt> title.
CORPUS_NAMES = {
    "disco2_1": "Disco V2.1",
    "disco3": "Disco V3",
    "adso": "Sonetos Siglo de Oro",
    "adso100": "ADSO 100 poems corpus",
    "plc": "Poesía Lírica Castellana Siglo de Oro",
    "gongo": "Gongocorpus",
    "ecpa": "Eighteenth Century Poetry Archive",
    "4b4v": "For Better For Verse",
    "mel": "Métrique en Ligne",
    "bibit": "Biblioteca Italiana",
    "czverse": "Corpus of Czech Verse",
    "stichopt": "Stichotheque Portuguese",
}


def _build_tei_tree(poem, file_name):
    """Build the TEI element tree for one poem; return ``(tree, poem_id)``."""
    poem_title = poem["poem_title"]
    author = poem["author"]
    stanzas = poem["stanzas"]
    poem_id = f"{author}_{file_name}"

    root = et.Element("TEI")
    header = et.SubElement(root, "teiHeader")
    file_desc = et.SubElement(header, "fileDesc")

    title_stmt = et.SubElement(file_desc, "titleStmt")
    et.SubElement(title_stmt, "title").text = poem_title
    et.SubElement(title_stmt, "author").text = author

    # Created now to keep its position in <fileDesc>; filled in further down.
    extent = et.SubElement(file_desc, "extent")

    pub_stmt = et.SubElement(file_desc, "publicationStmt")
    et.SubElement(pub_stmt, "publisher").text = "UNED University"
    et.SubElement(pub_stmt, "idno").text = poem_id
    availability = et.SubElement(pub_stmt, "availability")
    availability.attrib["status"] = "free"
    et.SubElement(availability, "p").text = "The text is freely available."

    series_stmt = et.SubElement(file_desc, "seriesStmt")
    et.SubElement(series_stmt, "title").text = CORPUS_NAMES[poem["corpus"]]

    source_desc = et.SubElement(file_desc, "sourceDesc")
    bibl_source = et.SubElement(source_desc, "bibl")
    et.SubElement(bibl_source, "title").text = poem_title
    et.SubElement(bibl_source, "author").text = author

    measure_st = et.SubElement(extent, "measure")
    measure_st.attrib["unit"] = "stanza"
    measure_st.text = str(len(stanzas))

    text_poem = et.SubElement(root, "text")
    front_poem = et.SubElement(text_poem, "front")
    et.SubElement(front_poem, "head").text = poem_title
    body_poem = et.SubElement(text_poem, "body")

    lg_main = et.SubElement(body_poem, "lg")
    lg_main.attrib["xmlns"] = "http://www.tei-c.org/ns/1.0"
    lg_main.attrib["type"] = "poem"

    n_lines = 0
    for stanza in stanzas:
        n_lines += len(stanza["lines"])
        lg = et.SubElement(lg_main, "lg")
        lg.attrib["n"] = str(stanza["stanza_number"])
        stanza_type = stanza.get("stanza_type")
        if stanza_type:
            lg.attrib["stanza_type"] = stanza_type
        for line in stanza["lines"]:
            line_el = et.SubElement(lg, "l")
            line_el.text = line["line_text"]
            line_el.attrib["n"] = str(line["line_number"])
            # Optional annotations are only emitted when present and truthy.
            for key, attr in (("metrical_pattern", "met"),
                              ("rhyme", "rhyme"),
                              ("line_length", "line_length")):
                value = line.get(key)
                if value:
                    line_el.attrib[attr] = str(value)

    measure_l = et.SubElement(extent, "measure")
    measure_l.attrib["unit"] = "line"
    measure_l.text = str(n_lines)
    return et.ElementTree(root), poem_id


def generate_tei(corpora_list, output_path, ui_enabled=True):
    """
    Convert the parsed JSON poems of the given corpora into TEI XML files.

    Poems are read from ``<output_path>/tmp_corp/<corpus>/averell/parser``
    when ``ui_enabled`` is true and from
    ``<output_path>/<corpus>/averell/parser`` otherwise. Each poem is written
    to ``<output_path>/TEI/<poem corpus>/<author>/<author>_<file stem>.xml``.
    When ``ui_enabled`` is true the generated files of the listed corpora are
    also bundled into ``<output_path>/TEI.zip``, which is ``tmp/TEI.zip`` for
    the UI, as it calls this function with ``"tmp/"``.

    :param corpora_list: Corpus folder names (slugs) to convert
    :param output_path: Base folder holding the parsed corpora and receiving
        the ``TEI`` output
    :param ui_enabled: Whether the function is called from the gradio UI
    :raises KeyError: If a poem lacks a required field or its ``corpus``
        value is not a key of ``CORPUS_NAMES``
    """
    output_root = Path(output_path)
    tei_root = output_root / "TEI"
    for corpus in corpora_list:
        if ui_enabled:
            parser_dir = (output_root / "tmp_corp" / corpus
                          / "averell" / "parser")
        else:
            parser_dir = output_root / corpus / "averell" / "parser"
        for poem_path in parser_dir.glob("**/*.json"):
            # Explicit encoding: the platform default is not UTF-8 everywhere
            # and the corpora contain non-ASCII text.
            with open(poem_path, "r", encoding="utf-8") as poem_file:
                poem = json.load(poem_file)
            tree, poem_id = _build_tei_tree(poem, poem_path.stem)
            target_dir = tei_root / poem["corpus"] / poem["author"]
            target_dir.mkdir(parents=True, exist_ok=True)
            et.indent(tree, space=" ", level=0)
            tree.write(str(target_dir / f"{poem_id}.xml"),
                       encoding="UTF-8",
                       xml_declaration=True)
    if ui_enabled:
        # Zip next to the TEI folder that was just written instead of a
        # hard-coded tmp/ location; identical for output_path == "tmp/".
        output_root.mkdir(parents=True, exist_ok=True)
        with ZipFile(output_root / "TEI.zip", "w") as zfile:
            for corpus in corpora_list:
                for xml_path in (tei_root / corpus).glob("*/**/*.xml"):
                    zfile.write(xml_path)