Skip to content

Commit f4b3430

Browse files
committed
feat(document): improve better title extraction
1 parent 3dad2f1 commit f4b3430

File tree

2 files changed

+367
-1
lines changed

2 files changed

+367
-1
lines changed

backend/internal/application/services/document_service.go

Lines changed: 134 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,21 +163,154 @@ func extractTitleFromURL(urlStr string) string {
163163
return urlStr
164164
}
165165

166+
// Remove query parameters
166167
if idx := strings.Index(lastSegment, "?"); idx >= 0 {
167168
lastSegment = lastSegment[:idx]
168169
}
169170

171+
// Remove fragment
170172
if idx := strings.Index(lastSegment, "#"); idx >= 0 {
171173
lastSegment = lastSegment[:idx]
172174
}
173175

176+
// Remove file extension
174177
if idx := strings.LastIndex(lastSegment, "."); idx > 0 {
175-
return lastSegment[:idx]
178+
lastSegment = lastSegment[:idx]
176179
}
177180

181+
// Clean up hash/ID suffixes (Notion, GitHub, GitLab, etc.)
182+
lastSegment = cleanHashSuffix(lastSegment)
183+
178184
return lastSegment
179185
}
180186

187+
// cleanHashSuffix removes common hash/ID patterns appended by various platforms
188+
// Examples:
189+
// - "Introduction-to-Cybersecurity-26b2915834718093a062f54c798d63c5" -> "Introduction-to-Cybersecurity"
190+
// - "My-Document-abc123def456" -> "My-Document"
191+
// - "Report-2024-1a2b3c4d5e6f" -> "Report-2024"
192+
func cleanHashSuffix(title string) string {
193+
// Pattern 1: Remove UUID-like suffixes (with dashes) - check this first before splitting
194+
// Example: "Title-a1b2c3d4-e5f6-7890-abcd-ef1234567890" -> "Title"
195+
// UUID format: 8-4-4-4-12 = 36 chars total with dashes
196+
parts := strings.Split(title, "-")
197+
if len(parts) >= 6 {
198+
// Check if last 5 segments form a UUID pattern
199+
potentialUUID := strings.Join(parts[len(parts)-5:], "-")
200+
cleanUUID := strings.ReplaceAll(potentialUUID, "-", "")
201+
if len(cleanUUID) == 32 && isHexString(cleanUUID) {
202+
return strings.Join(parts[:len(parts)-5], "-")
203+
}
204+
}
205+
206+
// Pattern 2: Remove long hexadecimal suffixes (24+ chars) - Notion style
207+
// Example: "Title-26b2915834718093a062f54c798d63c5" -> "Title"
208+
if idx := strings.LastIndex(title, "-"); idx > 0 {
209+
suffix := title[idx+1:]
210+
if len(suffix) >= 24 && isHexString(suffix) {
211+
return title[:idx]
212+
}
213+
}
214+
215+
// Pattern 3: Remove short hash suffixes (8-16 chars) only if alphanumeric
216+
// Example: "Document-abc123def" -> "Document"
217+
if idx := strings.LastIndex(title, "-"); idx > 0 {
218+
suffix := title[idx+1:]
219+
if len(suffix) >= 8 && len(suffix) <= 16 && isAlphanumeric(suffix) && hasLettersAndNumbers(suffix) {
220+
return title[:idx]
221+
}
222+
}
223+
224+
// Pattern 4: Remove numeric-only suffixes (timestamps, IDs) 8+ digits
225+
// Example: "Article-1234567890" -> "Article"
226+
if idx := strings.LastIndex(title, "-"); idx > 0 {
227+
suffix := title[idx+1:]
228+
if len(suffix) >= 8 && isNumericString(suffix) {
229+
return title[:idx]
230+
}
231+
}
232+
233+
// Pattern 5: Remove base64-like suffixes (URL-safe base64)
234+
// Example: "Page-aGVsbG93b3JsZA" -> "Page"
235+
if idx := strings.LastIndex(title, "-"); idx > 0 {
236+
suffix := title[idx+1:]
237+
if len(suffix) >= 12 && isBase64Like(suffix) {
238+
return title[:idx]
239+
}
240+
}
241+
242+
return title
243+
}
244+
245+
// isHexString reports whether s is non-empty and made up exclusively of
// hexadecimal digits (0-9, a-f, A-F).
func isHexString(s string) bool {
	for _, r := range s {
		switch {
		case '0' <= r && r <= '9':
		case 'a' <= r && r <= 'f':
		case 'A' <= r && r <= 'F':
		default:
			// Anything outside the three hex ranges disqualifies the string.
			return false
		}
	}
	// The loop never runs for "", so emptiness is rejected here.
	return s != ""
}
257+
258+
// isAlphanumeric reports whether s is non-empty and every character is an
// ASCII letter (a-z, A-Z) or an ASCII digit (0-9).
func isAlphanumeric(s string) bool {
	if s == "" {
		return false
	}
	// Scanning bytes is sufficient: every byte of a non-ASCII character is
	// >= 0x80 and therefore falls outside all three accepted ranges.
	for i := 0; i < len(s); i++ {
		c := s[i]
		isDigit := '0' <= c && c <= '9'
		isLower := 'a' <= c && c <= 'z'
		isUpper := 'A' <= c && c <= 'Z'
		if !isDigit && !isLower && !isUpper {
			return false
		}
	}
	return true
}
270+
271+
// hasLettersAndNumbers reports whether s contains at least one ASCII letter
// and at least one ASCII digit. Callers use the mix as a hint that the text is
// a generated hash rather than a plain word or a plain number. Characters that
// are neither letters nor digits are ignored.
func hasLettersAndNumbers(s string) bool {
	var sawLetter, sawDigit bool
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case '0' <= c && c <= '9':
			sawDigit = true
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
			sawLetter = true
		}
		if sawLetter && sawDigit {
			// Both kinds found; the remaining characters cannot change the answer.
			return true
		}
	}
	return false
}
285+
286+
// isNumericString reports whether s is non-empty and consists solely of the
// ASCII digits 0-9. Signs, separators and non-ASCII digits are rejected.
func isNumericString(s string) bool {
	digits := 0
	for i := 0; i < len(s); i++ {
		if s[i] < '0' || s[i] > '9' {
			return false
		}
		digits++
	}
	// Zero digits means the input was empty.
	return digits > 0
}
298+
299+
// isBase64Like reports whether s looks like a URL-safe base64 token. It is a
// heuristic: the string qualifies when it is non-empty and at least 90% of its
// length is made up of ASCII letters, ASCII digits, '_' or '-'.
func isBase64Like(s string) bool {
	total := len(s)
	if total == 0 {
		return false
	}
	matching := 0
	// Only single-byte ASCII characters can match, so counting bytes gives the
	// same tally as counting characters; the ratio is taken over the byte length.
	for i := 0; i < total; i++ {
		switch c := s[i]; {
		case '0' <= c && c <= '9',
			'a' <= c && c <= 'z',
			'A' <= c && c <= 'Z',
			c == '_',
			c == '-':
			matching++
		}
	}
	return float64(matching)/float64(total) >= 0.9
}
313+
181314
// computeChecksumForURL attempts to compute the checksum for a remote URL
182315
// Returns nil if the checksum cannot be computed (error, too large, etc.)
183316
func (s *DocumentService) computeChecksumForURL(url string) *checksum.Result {

0 commit comments

Comments
 (0)