33import re
44from collections import defaultdict
55
6+ import tensorflow as tf
67import tensorflow_datasets as tfds
78
89from os import path
@@ -79,6 +80,7 @@ def _info(self) -> tfds.core.DatasetInfo:
7980 features = {
8081 "id" : tfds .features .Text (),
8182 "glosses" : tfds .features .Sequence (tfds .features .Text ()),
83+ "frequencies" : tfds .features .Sequence (tf .int32 ),
8284 "hamnosys" : tfds .features .Text (),
8385 "views" : tfds .features .Sequence (video_feature )
8486 }
@@ -112,6 +114,7 @@ def get_galex_data(self, dl_manager: tfds.download.DownloadManager):
112114 datum = {
113115 "id" : "galex_" + gloss ,
114116 "glosses" : [gloss ],
117+ "frequencies" : [],
115118 "hamnosys" : re .findall (r'a class=\"ham\".*?>(.*?)<' , content )[0 ],
116119 "views" : [{
117120 "name" : "front" ,
@@ -131,11 +134,16 @@ def get_galex_data(self, dl_manager: tfds.download.DownloadManager):
131134 def get_dgs_data (self , dl_manager : tfds .download .DownloadManager ):
132135 MEINE_DGS = "https://www.sign-lang.uni-hamburg.de/meinedgs/"
133136 dgs_index = dl_manager .download (MEINE_DGS + "ling/types_de.html" )
137+
134138 gloss_map = defaultdict (list )
139+ gloss_frequencies = defaultdict (list )
140+
135141 with open (dgs_index , "r" , encoding = "utf-8" ) as f :
136- for match in re .finditer (r'<p>(.*?) \(\d* Tokens\)( → )?(.*?)</p>' , f .read ()):
142+ for match in re .finditer (r'<p>(.*?) \((\d+) Tokens? \)( → )?(.*?)</p>' , f .read ()):
137143 gloss_id = re .findall (r'\.\.\/types\/(.*?)\.html' , match .group (0 ))[0 ]
138- gloss_text = match .group (1 ) if match .group (3 ) != "" else re .findall (r'>(.*?)<' , match .group (1 ))[0 ]
144+ gloss_frequency = int (match .group (2 ))
145+ gloss_frequencies [gloss_id ].append (gloss_frequency )
146+ gloss_text = match .group (1 ) if match .group (3 ) is not None else re .findall (r'>(.*?)<' , match .group (1 ))[0 ]
139147 gloss_map [gloss_id ].append (gloss_text )
140148
141149 gloss_ids = list (gloss_map .keys ())
@@ -162,11 +170,14 @@ def get_dgs_data(self, dl_manager: tfds.download.DownloadManager):
162170 })
163171 video_urls [view_video_url ] = view_video_url
164172
173+ frequencies = gloss_frequencies [gloss_id ]
174+
165175 hamnosys_search = re .findall (r'class=\"hamnosys\".*?>(.*?)<' , content )
166176 hamnosys = hamnosys_search [0 ] if len (hamnosys_search ) > 0 else ""
167177
168178 data .append ({
169179 "id" : gloss_id ,
180+ "frequencies" : frequencies ,
170181 "glosses" : glosses ,
171182 "hamnosys" : hamnosys ,
172183 "views" : views
0 commit comments