Skip to content

Commit 4ed5e0f

Browse files
committed
♻️ Synced dbin 📦 <-- appstream scrapper: use only EN fields ⌚
1 parent 18d9078 commit 4ed5e0f

File tree

1 file changed

+97
-25
lines changed

1 file changed

+97
-25
lines changed

misc/cmd/flatpakAppStreamScrapper/scrapper.go

Lines changed: 97 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -11,15 +11,14 @@ import (
1111
"strconv"
1212
"strings"
1313

14+
"jaytaylor.com/html2text"
1415
"github.com/fxamacker/cbor/v2"
1516
"github.com/goccy/go-json"
1617
"github.com/shamaton/msgpack/v2"
1718
minify "github.com/tdewolff/minify/v2"
1819
mjson "github.com/tdewolff/minify/v2/json"
1920
)
2021

21-
const lang = ""
22-
2322
type Tag struct {
2423
XMLName xml.Name
2524
Content string `xml:",innerxml"`
@@ -54,13 +53,21 @@ type Releases struct {
5453
}
5554

5655
type Component struct {
57-
Name []Tag `xml:"name"`
58-
Screenshots []Screenshot `xml:"screenshots>screenshot"`
59-
Summary []Tag `xml:"summary"`
60-
Description []Tag `xml:"description>p"`
61-
Categories []Tag `xml:"categories>category"`
62-
Keywords []Tag `xml:"keywords>keyword"`
63-
Icons []struct {
56+
Names []struct {
57+
Lang string `xml:"lang,attr"`
58+
Content string `xml:",chardata"`
59+
} `xml:"name"`
60+
Summaries []struct {
61+
Lang string `xml:"lang,attr"`
62+
Content string `xml:",chardata"`
63+
} `xml:"summary"`
64+
Descriptions []struct {
65+
Lang string `xml:"lang,attr"`
66+
Content string `xml:",innerxml"`
67+
} `xml:"description"`
68+
Categories []Tag `xml:"categories>category"`
69+
Keywords []Tag `xml:"keywords>keyword"`
70+
Icons []struct {
6471
Type string `xml:"type,attr"`
6572
Width string `xml:"width,attr"`
6673
Height string `xml:"height,attr"`
@@ -78,6 +85,7 @@ type Component struct {
7885
} `xml:"launchable"`
7986
ContentRating []Tag `xml:"content_rating"`
8087
Releases Releases `xml:"releases"`
88+
Screenshots []Screenshot `xml:"screenshots>screenshot"`
8189
}
8290

8391
type AppStreamData struct {
@@ -126,24 +134,32 @@ func saveCBOR(filename string, metadata []AppStreamData) error {
126134
}
127135
return os.WriteFile(filename+".cbor", cborData, 0644)
128136
}
137+
129138
func saveJSON(filename string, metadata []AppStreamData) error {
130-
jsonData, err := json.MarshalIndent(metadata, "", " ")
131-
if err != nil {
139+
var buffer strings.Builder
140+
encoder := json.NewEncoder(&buffer)
141+
encoder.SetEscapeHTML(false) // Prevent escaping HTML tags
142+
encoder.SetIndent("", " ")
143+
144+
if err := encoder.Encode(metadata); err != nil {
132145
return err
133146
}
147+
148+
jsonData := []byte(buffer.String())
134149
if err := os.WriteFile(filename+".json", jsonData, 0644); err != nil {
135150
return err
136151
}
137-
// Minify JSON
152+
138153
m := minify.New()
139154
m.AddFunc("application/json", mjson.Minify)
140-
if jsonData, err = m.Bytes("application/json", jsonData); err != nil {
155+
if minifiedData, err := m.Bytes("application/json", jsonData); err != nil {
141156
return err
142-
} else if err := os.WriteFile(filename+".min.json", jsonData, 0644); err != nil {
157+
} else if err := os.WriteFile(filename+".min.json", minifiedData, 0644); err != nil {
143158
return err
144159
}
145160
return nil
146161
}
162+
147163
func saveMsgp(filename string, metadata []AppStreamData) error {
148164
msgpData, err := msgpack.Marshal(metadata)
149165
if err != nil {
@@ -162,15 +178,24 @@ func getCategoriesString(categories []Tag) string {
162178
return strings.Join(categoryStrings, ",")
163179
}
164180

165-
func getRichDescription(descriptions []Tag) string {
181+
func getRichDescription(descriptions []struct {
182+
Lang string `xml:"lang,attr"`
183+
Content string `xml:",innerxml"`
184+
}) string {
166185
return getContentByLang(descriptions)
167186
}
168187

169-
func getName(names []Tag) string {
188+
func getName(names []struct {
189+
Lang string `xml:"lang,attr"`
190+
Content string `xml:",chardata"`
191+
}) string {
170192
return getContentByLang(names)
171193
}
172194

173-
func getSummary(summaries []Tag) string {
195+
func getSummary(summaries []struct {
196+
Lang string `xml:"lang,attr"`
197+
Content string `xml:",chardata"`
198+
}) string {
174199
return getContentByLang(summaries)
175200
}
176201

@@ -182,12 +207,60 @@ func getContentRating(ratings []Tag) string {
182207
return contentRating.String()
183208
}
184209

185-
func getContentByLang(tags []Tag) string {
186-
for _, tag := range tags {
187-
if tag.Lang == lang {
188-
return tag.Content
210+
// getContentByLang picks the text to use from a list of localized elements
// and returns it with surrounding whitespace trimmed.
//
// Selection order:
//  1. the first element whose lang is English ("en", "en_US" or "en_GB");
//  2. otherwise the first element that has no lang attribute;
//  3. otherwise the very first element, whatever its language.
//
// Language tags are compared case-insensitively and "-" is treated like "_",
// so "en-GB" and "en_GB" are both recognized as English.
//
// Only the two anonymous struct shapes handled by langAndContent are
// understood. Elements of any other type are ignored, so a slice of an
// unsupported type (or an empty slice) yields "".
func getContentByLang[T any](elements []T) string {
	var untranslated, first string
	var haveUntranslated, haveFirst bool

	for i, elem := range elements {
		lang, content, ok := langAndContent(elem)
		if !ok {
			continue
		}
		if isEnglishLang(lang) {
			// English wins outright; no need to look further.
			return strings.TrimSpace(content)
		}
		if i == 0 {
			// Last-resort fallback is strictly the first element.
			first, haveFirst = content, true
		}
		if lang == "" && !haveUntranslated {
			untranslated, haveUntranslated = content, true
		}
	}

	switch {
	case haveUntranslated:
		return strings.TrimSpace(untranslated)
	case haveFirst:
		return strings.TrimSpace(first)
	}
	return ""
}

// langAndContent extracts the Lang and Content fields from one of the two
// supported localized element shapes. ok is false for any other type.
// The cases must spell out the struct tags because tags are part of a
// struct type's identity in Go.
func langAndContent(elem any) (lang, content string, ok bool) {
	switch v := elem.(type) {
	case struct {
		Lang    string `xml:"lang,attr"`
		Content string `xml:",chardata"`
	}:
		return v.Lang, v.Content, true
	case struct {
		Lang    string `xml:"lang,attr"`
		Content string `xml:",innerxml"`
	}:
		return v.Lang, v.Content, true
	}
	return "", "", false
}

// isEnglishLang reports whether lang names one of the accepted English
// locales, tolerating both "_" and "-" separators and any letter case.
func isEnglishLang(lang string) bool {
	switch strings.ToLower(strings.ReplaceAll(lang, "-", "_")) {
	case "en", "en_us", "en_gb":
		return true
	}
	return false
}
193266

@@ -231,7 +304,6 @@ func main() {
231304
}
232305

233306
for _, screenshot := range component.Screenshots {
234-
// Sort images by area (largest first)
235307
sort.Slice(screenshot.Images, func(i, j int) bool {
236308
widthI, _ := strconv.Atoi(screenshot.Images[i].Width)
237309
heightI, _ := strconv.Atoi(screenshot.Images[i].Height)
@@ -250,9 +322,9 @@ func main() {
250322
}
251323

252324
categories := getCategoriesString(component.Categories)
253-
richDescription := getRichDescription(component.Description)
254-
name := getName(component.Name)
255-
summary := getSummary(component.Summary)
325+
richDescription := getRichDescription(component.Descriptions)
326+
name := getName(component.Names)
327+
summary := getSummary(component.Summaries)
256328
contentRating := getContentRating(component.ContentRating)
257329
version := ""
258330
if len(component.Releases.Release) > 0 {

0 commit comments

Comments
 (0)