Skip to content

Commit bac4a9f

Browse files
committed
spread lexeme feature values to occurrences
1 parent 5d450ce commit bac4a9f

File tree

6 files changed

+1315608
-5
lines changed

6 files changed

+1315608
-5
lines changed

docs/news.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,15 @@ title: News
33
type: pages
44
---
55

6+
### 2019-01-31
7+
8+
Some features only had values for lexeme nodes: `gloss`, `nametype`, `voc_lex`, `voc_lex_utf8`.
9+
If you want to know the values for individual words, you can easily go from a lexeme node
10+
down to its occurrences with the `L.d()` function, at least if you are programming.
11+
12+
But if you are querying, templates tend to become cumbersome because of this.
13+
So I copied the values that these features have on each lexeme to all occurrences of that lexeme.
14+
615
### 2018-01-17
716

817
* There has been a conversion error: a single lexeme node became tied to a single stray node,

programs/spreadLexFeatures.ipynb

Lines changed: 274 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,281 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
"This notebook looks"
7+
"This notebook looks for all features on lexeme nodes and spreads their values over all occurrences of each lexeme, if that has not already been done."
88
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 1,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import os\n",
17+
"import collections\n",
18+
"\n",
19+
"from tf.fabric import Fabric"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": 2,
25+
"metadata": {},
26+
"outputs": [],
27+
"source": [
28+
"BASE = os.path.expanduser('~/github')\n",
29+
"ORG = 'etcbc'\n",
30+
"REPO = 'bhsa'\n",
31+
"VERSION = 'c'\n",
32+
"\n",
33+
"REPO_PATH = f'{BASE}/{ORG}/{REPO}'\n",
34+
"TF_IN = f'{REPO_PATH}/tf/{VERSION}'\n",
35+
"TF_OUT = f'{REPO_PATH}/_temp/lex/{VERSION}'"
36+
]
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": 7,
41+
"metadata": {},
42+
"outputs": [],
43+
"source": [
44+
"lexFeatures = '''\n",
45+
" gloss\n",
46+
" nametype\n",
47+
" voc_lex\n",
48+
" voc_lex_utf8\n",
49+
"'''.strip().split()"
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": 16,
55+
"metadata": {},
56+
"outputs": [],
57+
"source": [
58+
"generic = dict(\n",
59+
" author='Eep Talstra Centre for Bible and Computer',\n",
60+
" dataset='BHSA',\n",
61+
" datasetName='Biblia Hebraica Stuttgartensia Amstelodamensis',\n",
62+
" email='shebanq@ancient-data.org',\n",
63+
" encoders='Constantijn Sikkel (QDF), and Dirk Roorda (TF)',\n",
64+
" version='c',\n",
65+
" website='https://shebanq.ancient-data.org',\n",
66+
")"
67+
]
68+
},
69+
{
70+
"cell_type": "code",
71+
"execution_count": 17,
72+
"metadata": {},
73+
"outputs": [],
74+
"source": [
75+
"featureMeta = {feat: dict(valueType='str') for feat in lexFeatures}"
76+
]
77+
},
78+
{
79+
"cell_type": "code",
80+
"execution_count": 18,
81+
"metadata": {},
82+
"outputs": [
83+
{
84+
"data": {
85+
"text/plain": [
86+
"{'': {'author': 'Eep Talstra Centre for Bible and Computer',\n",
87+
" 'dataset': 'BHSA',\n",
88+
" 'datasetName': 'Biblia Hebraica Stuttgartensia Amstelodamensis',\n",
89+
" 'email': 'shebanq@ancient-data.org',\n",
90+
" 'encoders': 'Constantijn Sikkel (QDF), and Dirk Roorda (TF)',\n",
91+
" 'version': 'c',\n",
92+
" 'website': 'https://shebanq.ancient-data.org'},\n",
93+
" 'gloss': {'valueType': 'str'},\n",
94+
" 'nametype': {'valueType': 'str'},\n",
95+
" 'voc_lex': {'valueType': 'str'},\n",
96+
" 'voc_lex_utf8': {'valueType': 'str'}}"
97+
]
98+
},
99+
"execution_count": 18,
100+
"metadata": {},
101+
"output_type": "execute_result"
102+
}
103+
],
104+
"source": [
105+
"metaData = {'': generic}\n",
106+
"metaData.update(featureMeta)\n",
107+
"metaData"
108+
]
109+
},
110+
{
111+
"cell_type": "code",
112+
"execution_count": 19,
113+
"metadata": {},
114+
"outputs": [
115+
{
116+
"name": "stdout",
117+
"output_type": "stream",
118+
"text": [
119+
"This is Text-Fabric 7.4.4\n",
120+
"Api reference : https://annotation.github.io/text-fabric/Api/Fabric/\n",
121+
"\n",
122+
"114 features found and 0 ignored\n"
123+
]
124+
}
125+
],
126+
"source": [
127+
"TFin = Fabric(locations=TF_IN)"
128+
]
129+
},
130+
{
131+
"cell_type": "code",
132+
"execution_count": 20,
133+
"metadata": {},
134+
"outputs": [
135+
{
136+
"name": "stdout",
137+
"output_type": "stream",
138+
"text": [
139+
" 0.00s loading features ...\n",
140+
" | 0.01s B voc_lex_utf8 from /Users/dirk/github/etcbc/bhsa/tf/c\n",
141+
" | 0.01s B gloss from /Users/dirk/github/etcbc/bhsa/tf/c\n",
142+
" | 0.00s B nametype from /Users/dirk/github/etcbc/bhsa/tf/c\n",
143+
" | 0.01s B voc_lex from /Users/dirk/github/etcbc/bhsa/tf/c\n",
144+
" 3.58s All features loaded/computed - for details use loadLog()\n"
145+
]
146+
},
147+
{
148+
"data": {
149+
"text/plain": [
150+
"[('Computed',\n",
151+
" 'computed-data',\n",
152+
" ('C Computed', 'Call AllComputeds', 'Cs ComputedString')),\n",
153+
" ('Features', 'edge-features', ('E Edge', 'Eall AllEdges', 'Es EdgeString')),\n",
154+
" ('Fabric', 'loading', ('ensureLoaded', 'TF', 'ignored', 'loadLog')),\n",
155+
" ('Locality', 'locality', ('L Locality',)),\n",
156+
" ('Misc', 'messaging', ('cache', 'error', 'indent', 'info', 'reset')),\n",
157+
" ('Nodes',\n",
158+
" 'navigating-nodes',\n",
159+
" ('N Nodes', 'sortKey', 'sortKeyTuple', 'otypeRank', 'sortNodes')),\n",
160+
" ('Features',\n",
161+
" 'node-features',\n",
162+
" ('F Feature', 'Fall AllFeatures', 'Fs FeatureString')),\n",
163+
" ('Search', 'search', ('S Search',)),\n",
164+
" ('Text', 'text', ('T Text',))]"
165+
]
166+
},
167+
"execution_count": 20,
168+
"metadata": {},
169+
"output_type": "execute_result"
170+
}
171+
],
172+
"source": [
173+
"api = TFin.load(lexFeatures)\n",
174+
"api.makeAvailableIn(globals())"
175+
]
176+
},
177+
{
178+
"cell_type": "code",
179+
"execution_count": 24,
180+
"metadata": {},
181+
"outputs": [
182+
{
183+
"name": "stdout",
184+
"output_type": "stream",
185+
"text": [
186+
"gloss ...\n",
187+
"nametype ...\n",
188+
"voc_lex ...\n",
189+
"voc_lex_utf8 ...\n"
190+
]
191+
}
192+
],
193+
"source": [
194+
"nodeFeatures = collections.defaultdict(dict)\n",
195+
"\n",
196+
"for feat in lexFeatures:\n",
197+
" print(f'{feat} ...')\n",
198+
" for lx in F.otype.s('lex'):\n",
199+
" value = Fs(feat).v(lx)\n",
200+
" if value is not None:\n",
201+
" for w in L.d(lx, otype='word'):\n",
202+
" nodeFeatures[feat][w] = value\n",
203+
" nodeFeatures[feat][lx] = value"
204+
]
205+
},
206+
{
207+
"cell_type": "code",
208+
"execution_count": 25,
209+
"metadata": {},
210+
"outputs": [
211+
{
212+
"name": "stdout",
213+
"output_type": "stream",
214+
"text": [
215+
"This is Text-Fabric 7.4.4\n",
216+
"Api reference : https://annotation.github.io/text-fabric/Api/Fabric/\n",
217+
"\n",
218+
"0 features found and 0 ignored\n"
219+
]
220+
},
221+
{
222+
"name": "stderr",
223+
"output_type": "stream",
224+
"text": [
225+
" 0.00s Warp feature \"otype\" not found in\n",
226+
"/Users/dirk/github/etcbc/bhsa/_temp/lex/c/\n",
227+
" 0.00s Warp feature \"oslots\" not found in\n",
228+
"/Users/dirk/github/etcbc/bhsa/_temp/lex/c/\n"
229+
]
230+
},
231+
{
232+
"name": "stdout",
233+
"output_type": "stream",
234+
"text": [
235+
" 0.00s Warp feature \"otext\" not found. Working without Text-API\n",
236+
"\n"
237+
]
238+
}
239+
],
240+
"source": [
241+
"TFout = Fabric(locations=TF_OUT)"
242+
]
243+
},
244+
{
245+
"cell_type": "code",
246+
"execution_count": 27,
247+
"metadata": {},
248+
"outputs": [
249+
{
250+
"name": "stdout",
251+
"output_type": "stream",
252+
"text": [
253+
" 0.00s Exporting 4 node and 0 edge and 0 config features to /Users/dirk/github/etcbc/bhsa/_temp/lex/c:\n",
254+
" | 0.56s T gloss to /Users/dirk/github/etcbc/bhsa/_temp/lex/c\n",
255+
" | 0.05s T nametype to /Users/dirk/github/etcbc/bhsa/_temp/lex/c\n",
256+
" | 0.51s T voc_lex to /Users/dirk/github/etcbc/bhsa/_temp/lex/c\n",
257+
" | 0.56s T voc_lex_utf8 to /Users/dirk/github/etcbc/bhsa/_temp/lex/c\n",
258+
" 1.69s Exported 4 node features and 0 edge features and 0 config features to /Users/dirk/github/etcbc/bhsa/_temp/lex/c\n"
259+
]
260+
},
261+
{
262+
"data": {
263+
"text/plain": [
264+
"True"
265+
]
266+
},
267+
"execution_count": 27,
268+
"metadata": {},
269+
"output_type": "execute_result"
270+
}
271+
],
272+
"source": [
273+
"TFout.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)"
274+
]
275+
},
276+
{
277+
"cell_type": "code",
278+
"execution_count": null,
279+
"metadata": {},
280+
"outputs": [],
281+
"source": []
9282
}
10283
],
11284
"metadata": {

0 commit comments

Comments
 (0)