Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion bsmetadata/metadata_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,13 @@ class HtmlProcessor(MetadataProcessor):
def process_local(self, metadata_attrs: Dict[str, Any]) -> Optional[Tuple[str, str]]:
    """Build the opening and closing tag strings for one local HTML metadata entry.

    Args:
        metadata_attrs: Metadata entry whose ``"value"`` holds the tag name under
            ``"tag"`` and the tag attributes under ``"attrs"``, stored as two
            parallel lists: ``"attr"`` (names) and ``"value"`` (values).

    Returns:
        A ``(start_tag, end_tag)`` pair, e.g. ``("<h1>", "</h1>")``. Attributes,
        when present, appear only in the start tag.
    """
    # We represent a html tag `T` by enclosing the corresponding text span with "<T>" and "</T>".
    # Example: An <b>apple</b> is an edible fruit.
    html = metadata_attrs["value"]
    tag = html["tag"]
    attrs = html["attrs"]
    # Each attribute is rendered as `name:"value"`; pairs are separated by single spaces.
    attributes = " ".join(f'{attr}:"{value}"' for attr, value in zip(attrs["attr"], attrs["value"]))
    if attributes:
        # Separate the attribute list from the tag name only when there is one.
        attributes = " " + attributes
    return f"<{tag}{attributes}>", f"</{tag}>"


class UrlProcessor(MetadataProcessor):
Expand Down
81 changes: 81 additions & 0 deletions tests/test_metadata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from transformers import GPT2TokenizerFast

from bsmetadata.input_pipeline import DataConfig
from bsmetadata.metadata_processors import PROCESSORS, HtmlProcessor, MetadataProcessor
from bsmetadata.metadata_processors import PROCESSORS, MetadataProcessor
from bsmetadata.metadata_utils import (
add_local_metadata_to_text,
Expand Down Expand Up @@ -57,6 +58,76 @@ def setUp(self) -> None:
{"key": "url", "type": "global", "value": "callto:RickAndMorty/Year%202021/"},
],
},
{
"id": "0004",
"text": "useless text The Walking Dead (season 8)\n",
"metadata": [
{
"char_start_idx": 13,
"value": {
"tag": "h1",
"attrs": {"attr": [], "value": []},
},
"char_end_idx": 40,
"key": "html",
"type": "local",
},
{
"char_start_idx": 13,
"value": {
"tag": "div",
"attrs": {"attr": [], "value": []},
},
"char_end_idx": 13,
"key": "html",
"type": "local",
},
{
"char_start_idx": 0,
"value": {"tag": "a", "attrs": {"attr": [], "value": []}},
"char_end_idx": 13,
"key": "html",
"type": "local",
},
{
"char_start_idx": 13,
"value": {
"tag": "div",
"attrs": {"attr": [], "value": []},
},
"char_end_idx": 13,
"key": "html",
"type": "local",
},
{
"char_start_idx": 13,
"value": {
"tag": "a",
"attrs": {"attr": [], "value": []},
},
"char_end_idx": 13,
"key": "html",
"type": "local",
},
{
"char_start_idx": 13,
"value": {
"tag": "div",
"attrs": {"attr": [], "value": []},
},
"char_end_idx": 13,
"key": "html",
"type": "local",
},
{
"char_start_idx": 13,
"value": {"tag": "i", "attrs": {"attr": [], "value": []}},
"char_end_idx": 29,
"key": "html",
"type": "local",
},
],
},
]

def test_chunks(self):
Expand Down Expand Up @@ -133,6 +204,16 @@ def test_add_no_metadata_and_chunk_examples(self):
for example in mapped_ds:
self.assertTrue(all(not x for x in example["metadata_mask"]))

def test_add_html_tags(self):
cfg = DataConfig()
cfg.metadata_list = ["html"]
PROCESSORS["html"] = HtmlProcessor

text1, mask1 = add_local_metadata_to_text(self.examples[3], cfg)
target_text = '<a>useless text </a><div><a><div><div></div></div></a></div><h1><i>The Walking Dead</i> (season 8)</h1>\n'
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before the change I propose in this PR, the result here was:

<a>useless text </div></a></div></a></div><h1><i><div><a><div><div>The Walking Dead</i> (season 8)</h1>


self.assertEqual(text1, target_text)

def test_add_metadata_and_chunk_examples(self):
cfg = DataConfig()
cfg.metadata_list = ["url", "timestamp", "html", "entity"]
Expand Down