Skip to content

Commit 53457c7

Browse files
committed
Add full Quran dataset import, validation, and seed workflow
1 parent ce1f83d commit 53457c7

File tree

8 files changed

+44112
-11
lines changed

8 files changed

+44112
-11
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ The format is inspired by Keep a Changelog.
1010

1111
- Open source governance docs (`CONTRIBUTING.md`, `VERSIONING.md`, `NOTICE.md`, `LICENSE`)
1212
- documentation requirements for Quran dataset integrity and attribution
13+
- full dataset pipeline scripts:
14+
- `scripts/import` (Tanzil text + metadata -> `data/quran.json`)
15+
- `scripts/validate` (`6236` count, uniqueness, field/range checks)
16+
- `scripts/seed_relations` (starter mutashabihat examples)
1317

1418
## [0.1.0] - 2026-03-15
1519

CONTRIBUTING.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@ go run ./cmd/server
1616
```bash
1717
gofmt -w ./cmd ./internal
1818
go test ./...
19+
go run ./scripts/validate
20+
```
21+
22+
If your PR updates Quran text data, regenerate from source:
23+
24+
```bash
25+
go run ./scripts/import
26+
go run ./scripts/validate
1927
```
2028

2129
## Contribution Scope

README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,26 @@ go run ./cmd/server
1717

1818
Server listens on `http://localhost:8080`.
1919

20+
## Quran Dataset Workflow
21+
22+
Import full dataset from Tanzil into `data/quran.json`:
23+
24+
```bash
25+
go run ./scripts/import
26+
```
27+
28+
Validate dataset integrity:
29+
30+
```bash
31+
go run ./scripts/validate
32+
```
33+
34+
Seed starter relation pairs:
35+
36+
```bash
37+
go run ./scripts/seed_relations
38+
```
39+
2040
## API
2141

2242
- `GET /api/ayah/{surah}/{ayah}`
@@ -50,3 +70,7 @@ curl -X POST http://localhost:8080/api/relations \
5070
- `web/templates` server-rendered pages
5171
- `web/static` CSS
5272
- `data/quran.json` local Quran dataset
73+
- `data/relations.seed.json` starter relation pairs
74+
- `scripts/import` imports full Quran text + metadata from Tanzil
75+
- `scripts/validate` validates dataset contract
76+
- `scripts/seed_relations` seeds initial mutashabihat relation examples

data/quran.json

Lines changed: 43635 additions & 11 deletions
Large diffs are not rendered by default.

data/relations.seed.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[
2+
{"ayah1": "60:8", "ayah2": "60:9", "note": "mutashabihat"},
3+
{"ayah1": "2:2", "ayah2": "3:3", "note": "same phrase: نَزَّلَ عَلَيْكَ الْكِتَابَ"},
4+
{"ayah1": "55:13", "ayah2": "55:16", "note": "repeated rhetorical verse in Ar-Rahman"},
5+
{"ayah1": "93:7", "ayah2": "94:5", "note": "memorization reminder pair"}
6+
]

scripts/import/main.go

Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
package main
2+
3+
import (
	"bufio"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"
)
16+
17+
// Upstream Tanzil endpoints downloaded by run.
const (
	// textURL requests the Quran text with quranType=uthmani and
	// outType=txt-2, with the marks, sajdah and tatweel options switched on.
	// parseAyahLines expects each data line of the response to have the form
	// "surah|ayah|text".
	textURL = "https://tanzil.net/pub/download/index.php?quranType=uthmani&outType=txt-2&agree=true&marks=true&sajdah=true&tatweel=true"
	// metaURL is the JavaScript metadata file from which the QuranData.Sura
	// and QuranData.Juz arrays are extracted.
	metaURL = "https://tanzil.net/res/text/metadata/quran-data.js"
)
21+
22+
// ayah is one record of the generated dataset. The JSON tags define the field
// names, and the field order defines the key order, of each object written to
// data/quran.json.
type ayah struct {
	Surah     int    `json:"surah"`      // surah number, parsed from the first "|" field
	SurahName string `json:"surah_name"` // name looked up in the metadata by surah number
	Ayah      int    `json:"ayah"`       // ayah number, parsed from the second "|" field
	Juz       int    `json:"juz"`        // value returned by determineJuz for this verse
	TextAR    string `json:"text_ar"`    // remainder of the line after the second "|"
}
29+
30+
// point is a (surah, ayah) position. parseJuzStarts returns one point per
// boundary row found in the metadata.
type point struct {
	Surah, Ayah int
}
34+
35+
func main() {
36+
if err := run(); err != nil {
37+
fmt.Fprintf(os.Stderr, "import failed: %v\n", err)
38+
os.Exit(1)
39+
}
40+
}
41+
42+
func run() error {
43+
metadata, err := fetch(metaURL)
44+
if err != nil {
45+
return fmt.Errorf("fetch metadata: %w", err)
46+
}
47+
48+
surahNames, err := parseSurahNames(metadata)
49+
if err != nil {
50+
return fmt.Errorf("parse surah names: %w", err)
51+
}
52+
53+
juzStarts, err := parseJuzStarts(metadata)
54+
if err != nil {
55+
return fmt.Errorf("parse juz starts: %w", err)
56+
}
57+
58+
textData, err := fetch(textURL)
59+
if err != nil {
60+
return fmt.Errorf("fetch quran text: %w", err)
61+
}
62+
63+
ayahs, err := parseAyahLines(textData, surahNames, juzStarts)
64+
if err != nil {
65+
return fmt.Errorf("parse ayah lines: %w", err)
66+
}
67+
68+
if err := validate(ayahs); err != nil {
69+
return err
70+
}
71+
72+
outPath := filepath.Join("data", "quran.json")
73+
if err := writeJSON(outPath, ayahs); err != nil {
74+
return fmt.Errorf("write dataset: %w", err)
75+
}
76+
77+
fmt.Printf("Imported %d ayahs into %s\n", len(ayahs), outPath)
78+
return nil
79+
}
80+
81+
// fetch downloads url with an HTTP GET and returns the response body as a
// string. Any status other than 200 OK is reported as an error, and the
// returned string is empty whenever the error is non-nil.
func fetch(url string) (string, error) {
	// The default client has no timeout, so a stalled server would hang the
	// import forever. The limit covers the whole exchange, including reading
	// the body.
	client := &http.Client{Timeout: 2 * time.Minute}

	resp, err := client.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected status: %s", resp.Status)
	}

	b, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}

	return string(b), nil
}
99+
100+
func parseSurahNames(meta string) (map[int]string, error) {
101+
section, err := extractArray(meta, "QuranData.Sura")
102+
if err != nil {
103+
return nil, err
104+
}
105+
106+
re := regexp.MustCompile(`(?m)^\s*\[(\d+),\s*\d+,\s*\d+,\s*\d+,\s*'[^']*',\s*"([^"]+)"`)
107+
matches := re.FindAllStringSubmatch(section, -1)
108+
if len(matches) < 114 {
109+
return nil, fmt.Errorf("expected 114 surah rows, got %d", len(matches))
110+
}
111+
112+
names := make(map[int]string, 114)
113+
for idx, m := range matches {
114+
n := idx + 1
115+
names[n] = strings.TrimSpace(m[2])
116+
}
117+
return names, nil
118+
}
119+
120+
func parseJuzStarts(meta string) ([]point, error) {
121+
section, err := extractArray(meta, "QuranData.Juz")
122+
if err != nil {
123+
return nil, err
124+
}
125+
126+
re := regexp.MustCompile(`\[(\d+),\s*(\d+)\]`)
127+
matches := re.FindAllStringSubmatch(section, -1)
128+
if len(matches) < 31 {
129+
return nil, fmt.Errorf("expected >=31 juz points, got %d", len(matches))
130+
}
131+
132+
out := make([]point, 0, len(matches))
133+
for _, m := range matches {
134+
s, _ := strconv.Atoi(m[1])
135+
a, _ := strconv.Atoi(m[2])
136+
out = append(out, point{Surah: s, Ayah: a})
137+
}
138+
139+
return out, nil
140+
}
141+
142+
// extractArray returns the bracketed section that follows the first
// occurrence of marker in meta: everything from the first '[' at or after the
// marker through the ']' that brings the bracket nesting back to zero.
//
// Brackets are counted byte by byte, so any '[' or ']' in between takes part
// in the nesting count. An error is returned when the marker, the opening
// bracket, or the balancing closing bracket cannot be found.
func extractArray(meta, marker string) (string, error) {
	markerPos := strings.Index(meta, marker)
	if markerPos < 0 {
		return "", fmt.Errorf("marker not found: %s", marker)
	}

	open := strings.Index(meta[markerPos:], "[")
	if open < 0 {
		return "", fmt.Errorf("array start not found for: %s", marker)
	}

	body := meta[markerPos+open:]
	nesting := 0
	for off := 0; off < len(body); off++ {
		c := body[off]
		if c == '[' {
			nesting++
			continue
		}
		if c != ']' {
			continue
		}
		nesting--
		if nesting == 0 {
			return body[:off+1], nil
		}
	}

	return "", fmt.Errorf("array end not found for: %s", marker)
}
168+
169+
func parseAyahLines(text string, names map[int]string, juzStarts []point) ([]ayah, error) {
170+
scanner := bufio.NewScanner(strings.NewReader(text))
171+
scanner.Buffer(make([]byte, 0, 64*1024), 2*1024*1024)
172+
173+
out := make([]ayah, 0, 6236)
174+
for scanner.Scan() {
175+
line := strings.TrimSpace(scanner.Text())
176+
if line == "" || strings.HasPrefix(line, "#") {
177+
continue
178+
}
179+
if !strings.Contains(line, "|") {
180+
continue
181+
}
182+
parts := strings.SplitN(line, "|", 3)
183+
if len(parts) != 3 {
184+
return nil, fmt.Errorf("invalid text line: %q", line)
185+
}
186+
187+
surah, err := strconv.Atoi(parts[0])
188+
if err != nil {
189+
return nil, fmt.Errorf("invalid surah in line %q", line)
190+
}
191+
ayahNum, err := strconv.Atoi(parts[1])
192+
if err != nil {
193+
return nil, fmt.Errorf("invalid ayah in line %q", line)
194+
}
195+
196+
name := names[surah]
197+
if name == "" {
198+
return nil, fmt.Errorf("missing surah name for surah %d", surah)
199+
}
200+
201+
out = append(out, ayah{
202+
Surah: surah,
203+
SurahName: name,
204+
Ayah: ayahNum,
205+
Juz: determineJuz(surah, ayahNum, juzStarts),
206+
TextAR: parts[2],
207+
})
208+
}
209+
210+
if err := scanner.Err(); err != nil {
211+
return nil, err
212+
}
213+
214+
sort.Slice(out, func(i, j int) bool {
215+
if out[i].Surah == out[j].Surah {
216+
return out[i].Ayah < out[j].Ayah
217+
}
218+
return out[i].Surah < out[j].Surah
219+
})
220+
221+
return out, nil
222+
}
223+
224+
func determineJuz(surah, ayah int, starts []point) int {
225+
for i := 0; i < len(starts)-1; i++ {
226+
if before(surah, ayah, starts[i+1].Surah, starts[i+1].Ayah) {
227+
return i + 1
228+
}
229+
}
230+
return 30
231+
}
232+
233+
// before reports whether position (s1, a1) sorts strictly ahead of (s2, a2),
// comparing the first components and falling back to the second components
// when the first are equal.
func before(s1, a1, s2, a2 int) bool {
	switch {
	case s1 < s2:
		return true
	case s1 > s2:
		return false
	default:
		return a1 < a2
	}
}
239+
240+
func validate(ayahs []ayah) error {
241+
if len(ayahs) != 6236 {
242+
return fmt.Errorf("expected 6236 ayahs, got %d", len(ayahs))
243+
}
244+
245+
seen := make(map[string]struct{}, len(ayahs))
246+
for _, a := range ayahs {
247+
if a.Surah < 1 || a.Surah > 114 {
248+
return fmt.Errorf("invalid surah: %d", a.Surah)
249+
}
250+
if a.Ayah < 1 {
251+
return fmt.Errorf("invalid ayah: %d", a.Ayah)
252+
}
253+
if a.Juz < 1 || a.Juz > 30 {
254+
return fmt.Errorf("invalid juz: %d for %d:%d", a.Juz, a.Surah, a.Ayah)
255+
}
256+
if strings.TrimSpace(a.TextAR) == "" {
257+
return fmt.Errorf("empty text_ar for %d:%d", a.Surah, a.Ayah)
258+
}
259+
260+
k := fmt.Sprintf("%d:%d", a.Surah, a.Ayah)
261+
if _, ok := seen[k]; ok {
262+
return fmt.Errorf("duplicate ayah key: %s", k)
263+
}
264+
seen[k] = struct{}{}
265+
}
266+
267+
return nil
268+
}
269+
270+
// writeJSON encodes v as indented JSON into the file at path, creating the
// parent directory if needed and truncating any existing file. HTML escaping
// is disabled so characters such as '<', '>' and '&' are written verbatim.
//
// It returns the first error from creating the directory, creating the file,
// encoding v, or closing the file.
func writeJSON(path string, v any) error {
	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
		return err
	}

	f, err := os.Create(path)
	if err != nil {
		return err
	}

	enc := json.NewEncoder(f)
	enc.SetIndent("", " ")
	enc.SetEscapeHTML(false)
	if err := enc.Encode(v); err != nil {
		// The encode error is the one worth reporting; the close result adds
		// nothing once the write has already failed.
		f.Close()
		return err
	}

	// Close can surface a write error that was deferred by the OS, so its
	// result decides whether the dataset was really written.
	return f.Close()
}

0 commit comments

Comments
 (0)