Skip to content

Commit a4b29c0

Browse files
committed
feat: added Grand Débat JSON files
1 parent 3a97f78 commit a4b29c0

File tree

5 files changed

+271
-0
lines changed

5 files changed

+271
-0
lines changed

docs/source/api/debate.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2+
Debate API
3+
==========
4+
5+
Contributions
6+
-------------
7+
8+
Individual contributions
9+
10+
.. autoxpmconfig:: datamaestro_text.data.debate.granddebat.GrandDebatFile
11+
:members:

docs/source/api/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ Datamaestro Text API
99
embeddings
1010
recommendation
1111
nlp
12+
debate
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
# See documentation on https://datamaestro.readthedocs.io
2+
3+
from pathlib import Path
4+
from datamaestro.definitions import datatags, dataset
5+
from datamaestro_text.data.debate import GrandDebatFile
6+
from datamaestro.download.single import filedownloader
7+
from datamaestro.utils import HashCheck
8+
from datamaestro.stream import Transform
9+
import io
10+
import json
11+
import ijson
12+
import os
13+
import threading
14+
15+
16+
class JsonToJsonl(Transform):
    """Transforms a JSON file with an array into a JSONL file with one line per
    array element.

    The conversion is streamed: a background thread parses the input
    incrementally with ``ijson`` and writes each element of the top-level
    array into an OS pipe, whose read end is returned to the caller.
    """

    def __call__(self, fileobj: io.IOBase) -> io.IOBase:
        """Return a binary stream with one UTF-8 encoded JSON document per line.

        :param fileobj: stream containing a JSON document whose top-level
            value is an array
        :return: the read end of a pipe fed by a daemon thread; it reaches EOF
            once every array element has been written (or the conversion
            failed, in which case the error is logged)
        """
        # Function-scope imports keep this block self-contained
        import decimal
        import logging

        logger = logging.getLogger(__name__)

        # Best-effort rewind: some streams cannot seek, in which case we just
        # read from the current position.
        try:
            fileobj.seek(0)
        except Exception:
            logger.debug(
                "Could not rewind the input stream; reading from the current "
                "position",
                exc_info=True,
            )

        r_fd, w_fd = os.pipe()
        r_file = os.fdopen(r_fd, "rb")
        w_file = os.fdopen(w_fd, "wb")

        def _default(value):
            # ijson returns non-integer numbers as decimal.Decimal, which the
            # json module cannot serialize natively.
            if isinstance(value, decimal.Decimal):
                return float(value)
            raise TypeError(
                f"Object of type {type(value).__name__} is not JSON serializable"
            )

        def _writer(fin, fout):
            try:
                # The context manager closes the write end in every case, so
                # the reader always sees EOF instead of blocking forever.
                with fout:
                    for item in ijson.items(fin, "item"):
                        line = json.dumps(
                            item, ensure_ascii=False, default=_default
                        )
                        fout.write((line + "\n").encode("utf-8"))
            except BrokenPipeError:
                # The consumer closed the read end early: nothing left to do.
                logger.debug("JSONL reader closed the pipe before the end")
            except Exception:
                # The exception cannot propagate to the reader thread; make
                # the failure visible since the output is now truncated.
                logger.exception("JSON to JSONL conversion failed")

        t = threading.Thread(target=_writer, args=(fileobj, w_file), daemon=True)
        t.start()

        return r_file
47+
48+
49+
@filedownloader(
    # Resource name: its stem is also the name of the function parameter below
    "la_transition_ecologique_2019_03_21.jsonl",
    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_TRANSITION_ECOLOGIQUE.json",
    # NOTE(review): this digest is also used for the `fiscalité` and
    # `evenements` downloads in this file although the three URLs differ;
    # distinct files should not share a digest — recompute and confirm.
    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
    # Rewrites the top-level JSON array as one JSON object per line
    transforms=JsonToJsonl(),
)
@datatags("politics", "debate", "french")
@dataset(
    GrandDebatFile,
    url="https://granddebat.fr",
)
def transition(la_transition_ecologique_2019_03_21: Path):
    """Grand Débat National (transition écologique)

    The *Grand Débat National* (GDN) is a country-wide citizen consultation held
    in France in 2019.


    The consultation prompted citizens to express their views across four main
    themes: *Taxation and public spending*, *Organization of the state and
    public services*, *Democracy and citizenship*, and *Ecological transition*.
    A significant portion of this consultation involved online questionnaires,
    each concluding with a critical open-ended prompt: "Do you have anything to
    add about [theme]?".
    """
    # Presumably the path of the resource declared by @filedownloader is
    # injected through the parameter of the same name — TODO confirm.
    return GrandDebatFile.C(path=la_transition_ecologique_2019_03_21)
75+
76+
77+
@filedownloader(
    # Resource name: its stem is also the name of the function parameter below
    "fiscalité_et_dépenses_publiques_2019_03_21.jsonl",
    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json",
    # NOTE(review): this digest is also used for the `transition` and
    # `evenements` downloads in this file although the three URLs differ;
    # distinct files should not share a digest — recompute and confirm.
    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
    # Rewrites the top-level JSON array as one JSON object per line
    transforms=JsonToJsonl(),
)
@datatags("politics", "debate", "french")
@dataset(
    GrandDebatFile,
    url="https://granddebat.fr",
)
def fiscalité(fiscalité_et_dépenses_publiques_2019_03_21: Path):
    """Grand Débat National (fiscalité et dépenses publiques)

    The *Grand Débat National* (GDN) is a country-wide citizen consultation held
    in France in 2019.


    The consultation prompted citizens to express their views across four main
    themes: *Taxation and public spending*, *Organization of the state and
    public services*, *Democracy and citizenship*, and *Ecological transition*.
    A significant portion of this consultation involved online questionnaires,
    each concluding with a critical open-ended prompt: "Do you have anything to
    add about [theme]?".
    """
    # Presumably the path of the resource declared by @filedownloader is
    # injected through the parameter of the same name — TODO confirm.
    return GrandDebatFile.C(path=fiscalité_et_dépenses_publiques_2019_03_21)
103+
104+
105+
@filedownloader(
    # Resource name: its stem is also the name of the function parameter below
    "democratie_et_citoyennete_2019_03_21.jsonl",
    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/DEMOCRATIE_ET_CITOYENNETE.json",
    checker=HashCheck("049aaeca7e51747f2da5d68143c760fa"),
    # Rewrites the top-level JSON array as one JSON object per line
    transforms=JsonToJsonl(),
)
@datatags("politics", "debate", "french")
@dataset(
    GrandDebatFile,
    url="https://granddebat.fr",
)
def démocratie(democratie_et_citoyennete_2019_03_21: Path):
    """Grand Débat National (démocratie et citoyenneté)

    The *Grand Débat National* (GDN) is a country-wide citizen consultation held
    in France in 2019.


    The consultation prompted citizens to express their views across four main
    themes: *Taxation and public spending*, *Organization of the state and
    public services*, *Democracy and citizenship*, and *Ecological transition*.
    A significant portion of this consultation involved online questionnaires,
    each concluding with a critical open-ended prompt: "Do you have anything to
    add about [theme]?".
    """
    # Presumably the path of the resource declared by @filedownloader is
    # injected through the parameter of the same name — TODO confirm.
    return GrandDebatFile.C(path=democratie_et_citoyennete_2019_03_21)
131+
132+
133+
@filedownloader(
    # Resource name: its stem is also the name of the function parameter below
    "organisation_etat_services_publics_2019_03_21.jsonl",
    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json",
    checker=HashCheck("0ccb7c401889f738b73b0caab897a68b"),
    # Rewrites the top-level JSON array as one JSON object per line
    transforms=JsonToJsonl(),
)
@datatags("politics", "debate", "french")
@dataset(
    GrandDebatFile,
    url="https://granddebat.fr",
)
def organisation(organisation_etat_services_publics_2019_03_21: Path):
    """Grand Débat National (organisation de l'État et des services publics)

    The *Grand Débat National* (GDN) is a country-wide citizen consultation held
    in France in 2019.


    The consultation prompted citizens to express their views across four main
    themes: *Taxation and public spending*, *Organization of the state and
    public services*, *Democracy and citizenship*, and *Ecological transition*.
    A significant portion of this consultation involved online questionnaires,
    each concluding with a critical open-ended prompt: "Do you have anything to
    add about [theme]?".
    """
    # Presumably the path of the resource declared by @filedownloader is
    # injected through the parameter of the same name — TODO confirm.
    return GrandDebatFile.C(path=organisation_etat_services_publics_2019_03_21)
159+
160+
161+
@filedownloader(
    # Resource name: its stem is also the name of the function parameter below
    "les_evenements_2019_03_21.jsonl",
    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LES_EVENEMENTS.json",
    # NOTE(review): this digest is also used for the `transition` and
    # `fiscalité` downloads in this file although the three URLs differ;
    # distinct files should not share a digest — recompute and confirm.
    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
    # Rewrites the top-level JSON array as one JSON object per line
    transforms=JsonToJsonl(),
)
@datatags("politics", "debate", "french")
@dataset(
    GrandDebatFile,
    url="https://granddebat.fr",
)
def evenements(les_evenements_2019_03_21: Path):
    """Grand Débat National (événements)

    The *Grand Débat National* (GDN) is a country-wide citizen consultation held
    in France in 2019.


    The consultation prompted citizens to express their views across four main
    themes: *Taxation and public spending*, *Organization of the state and
    public services*, *Democracy and citizenship*, and *Ecological transition*.
    A significant portion of this consultation involved online questionnaires,
    each concluding with a critical open-ended prompt: "Do you have anything to
    add about [theme]?".
    """
    # Presumably the path of the resource declared by @filedownloader is
    # injected through the parameter of the same name — TODO confirm.
    return GrandDebatFile.C(path=les_evenements_2019_03_21)
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"""Data classes for debate datasets"""
2+
3+
from .granddebat import GrandDebatEntry, GrandDebatFile, GrandDebatResponse
4+
5+
__all__ = ["GrandDebatEntry", "GrandDebatFile", "GrandDebatResponse"]
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
"""Data classes for the Grand Débat National dataset"""
2+
3+
import json
4+
from dataclasses import dataclass, field
5+
from typing import Iterator, List, Optional
6+
7+
from datamaestro.data import File
8+
9+
10+
@dataclass
class GrandDebatResponse:
    """Answer to a single question within a Grand Débat National contribution

    Attributes:
        question_id: value of the ``questionId`` key of the JSON response
        question_title: value of the ``questionTitle`` key
        value: value of the ``value`` key, ``None`` when the key is absent
        formatted_value: value of the ``formattedValue`` key, ``None`` when
            the key is absent
    """

    # Question the response refers to
    question_id: str
    question_title: str

    # Answer content (both optional)
    value: Optional[str]
    formatted_value: Optional[str]
18+
19+
20+
@dataclass
class GrandDebatEntry:
    """One contribution (a line of the JSONL file) of the Grand Débat National

    Each attribute mirrors the camelCase key of the same name in the JSON
    record (e.g. ``created_at`` holds ``createdAt``). Timestamps are kept as
    the strings found in the file; no date parsing is performed here.
    """

    # Identification
    id: str
    reference: str
    title: str

    # Timestamps, stored as raw strings
    created_at: str
    published_at: str
    updated_at: Optional[str]

    # Moderation state
    trashed: bool
    trashed_status: Optional[str]

    # Author information
    author_id: str
    author_type: str
    author_zip_code: str

    # Answers given by the author; a fresh empty list when not provided
    responses: List[GrandDebatResponse] = field(default_factory=list)
36+
37+
38+
class GrandDebatFile(File):
    """A Grand Débat National JSONL file with iteration support

    Each non-blank line of the file is expected to hold one JSON object
    describing a contribution.
    """

    def __iter__(self) -> Iterator[GrandDebatEntry]:
        """Iterate over entries in the JSONL file

        Blank lines are skipped, and a missing or ``null`` ``responses`` key
        yields an entry with an empty list of responses.

        :raises json.JSONDecodeError: if a non-blank line is not valid JSON
        :raises KeyError: if a required key is missing from a record
        """
        with self.path.open("r", encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at the end)
                if not line.strip():
                    continue

                data = json.loads(line)

                # `or []` also covers an explicit `"responses": null`
                responses = [
                    GrandDebatResponse(
                        question_id=r["questionId"],
                        question_title=r["questionTitle"],
                        value=r.get("value"),
                        formatted_value=r.get("formattedValue"),
                    )
                    for r in data.get("responses") or []
                ]
                yield GrandDebatEntry(
                    id=data["id"],
                    reference=data["reference"],
                    title=data["title"],
                    created_at=data["createdAt"],
                    published_at=data["publishedAt"],
                    updated_at=data.get("updatedAt"),
                    trashed=data["trashed"],
                    trashed_status=data.get("trashedStatus"),
                    author_id=data["authorId"],
                    author_type=data["authorType"],
                    author_zip_code=data["authorZipCode"],
                    responses=responses,
                )

0 commit comments

Comments
 (0)