Skip to content

Commit f31f603

Browse files
authored
Merge pull request #18 from bricksdont/fix_dgs_types
Fix dgs types
2 parents 1f79860 + b78a43a commit f31f603

File tree

3 files changed: 44 additions and 3 deletions.

3 files changed: 44 additions and 3 deletions.

examples/load.ipynb

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -300,6 +300,36 @@
300300
"execution_count": null,
301301
"outputs": []
302302
},
303+
{
304+
"cell_type": "markdown",
305+
"source": [
306+
"# DGS Types"
307+
],
308+
"metadata": {
309+
"collapsed": false,
310+
"pycharm": {
311+
"name": "#%% md\n"
312+
}
313+
}
314+
},
315+
{
316+
"cell_type": "code",
317+
"execution_count": null,
318+
"outputs": [],
319+
"source": [
320+
"config = SignDatasetConfig(name=\"only-annotations\", version=\"1.0.0\", include_video=False, include_pose=None, process_video=False)\n",
321+
"dgs_types = tfds.load('dgs_types', builder_kwargs=dict(config=config))\n",
322+
"\n",
323+
"for datum in itertools.islice(dgs_types[\"train\"], 0, 10):\n",
324+
" print(datum)"
325+
],
326+
"metadata": {
327+
"collapsed": false,
328+
"pycharm": {
329+
"name": "#%%\n"
330+
}
331+
}
332+
},
303333
{
304334
"cell_type": "markdown",
305335
"source": [

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
setup(
1212
name="sign-language-datasets",
1313
packages=packages,
14-
version="0.1.0",
14+
version="0.1.1",
1515
description="TFDS Datasets for sign language",
1616
author="Amit Moryossef",
1717
author_email="amitmoryossef@gmail.com",

sign_language_datasets/datasets/dgs_types/dgs_types.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import re
44
from collections import defaultdict
55

6+
import tensorflow as tf
67
import tensorflow_datasets as tfds
78

89
from os import path
@@ -79,6 +80,7 @@ def _info(self) -> tfds.core.DatasetInfo:
7980
features = {
8081
"id": tfds.features.Text(),
8182
"glosses": tfds.features.Sequence(tfds.features.Text()),
83+
"frequencies": tfds.features.Sequence(tf.int32),
8284
"hamnosys": tfds.features.Text(),
8385
"views": tfds.features.Sequence(video_feature)
8486
}
@@ -112,6 +114,7 @@ def get_galex_data(self, dl_manager: tfds.download.DownloadManager):
112114
datum = {
113115
"id": "galex_" + gloss,
114116
"glosses": [gloss],
117+
"frequencies": [],
115118
"hamnosys": re.findall(r'a class=\"ham\".*?>(.*?)<', content)[0],
116119
"views": [{
117120
"name": "front",
@@ -131,11 +134,16 @@ def get_galex_data(self, dl_manager: tfds.download.DownloadManager):
131134
def get_dgs_data(self, dl_manager: tfds.download.DownloadManager):
132135
MEINE_DGS = "https://www.sign-lang.uni-hamburg.de/meinedgs/"
133136
dgs_index = dl_manager.download(MEINE_DGS + "ling/types_de.html")
137+
134138
gloss_map = defaultdict(list)
139+
gloss_frequencies = defaultdict(list)
140+
135141
with open(dgs_index, "r", encoding="utf-8") as f:
136-
for match in re.finditer(r'<p>(.*?) \(\d* Tokens\)( → )?(.*?)</p>', f.read()):
142+
for match in re.finditer(r'<p>(.*?) \((\d+) Tokens?\)( → )?(.*?)</p>', f.read()):
137143
gloss_id = re.findall(r'\.\.\/types\/(.*?)\.html', match.group(0))[0]
138-
gloss_text = match.group(1) if match.group(3) != "" else re.findall(r'>(.*?)<', match.group(1))[0]
144+
gloss_frequency = int(match.group(2))
145+
gloss_frequencies[gloss_id].append(gloss_frequency)
146+
gloss_text = match.group(1) if match.group(3) is not None else re.findall(r'>(.*?)<', match.group(1))[0]
139147
gloss_map[gloss_id].append(gloss_text)
140148

141149
gloss_ids = list(gloss_map.keys())
@@ -162,11 +170,14 @@ def get_dgs_data(self, dl_manager: tfds.download.DownloadManager):
162170
})
163171
video_urls[view_video_url] = view_video_url
164172

173+
frequencies = gloss_frequencies[gloss_id]
174+
165175
hamnosys_search = re.findall(r'class=\"hamnosys\".*?>(.*?)<', content)
166176
hamnosys = hamnosys_search[0] if len(hamnosys_search) > 0 else ""
167177

168178
data.append({
169179
"id": gloss_id,
180+
"frequencies": frequencies,
170181
"glosses": glosses,
171182
"hamnosys": hamnosys,
172183
"views": views

0 commit comments

Comments
 (0)