Skip to content

Commit 206b978

Browse files
committed
fix(feedback): improve ESOURCE link parsing
- Return empty on non-2xx responses (404 was leaking footer links) - Detect redirects via res.redirected instead of dead res.status === 302 - Use DOMParser to scope parsing to .list-group-item links only - Remove brittle SKIP_URLS blocklist and page-wide href regex
1 parent 9115286 commit 206b978

File tree

2 files changed

+114
-61
lines changed

2 files changed

+114
-61
lines changed

src/lib/__tests__/useGetResourceLinks.test.ts

Lines changed: 91 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -2,28 +2,51 @@ import { beforeEach, describe, expect, test, vi } from 'vitest';
22
import { fetchUrl, transformUrl } from '../useGetResourceLinks';
33

44
const htmlWithLinks = `
5-
<script src="my-script.js" defer=""></script>
6-
<script src="my-styles.css" defer=""></script>
7-
<a href="https://arxiv.org/pdf/1234.5678.pdf">arXiv</a>
8-
<a href="https://doi.org/10.1234/abcd">DOI</a>
9-
<a href="https://example.com/page.html">HTML</a>
10-
<a href="https://example.com/style.css">CSS</a>
11-
<a href="http://www.nasa.gov">NASA</a>
12-
<a href="https://example.com/document.pdf">PDF</a>
13-
<a href="https://example.com/page.html">HTML Duplicate</a>
5+
<body>
6+
<div class="header-container">
7+
<header class="starry-background-wrapper">
8+
<div class="logo-header">
9+
<a href="/">
10+
<img src="/styles/img/scix-logo.svg" alt="Science Explorer Logo">
11+
<b>Science Explorer</b>
12+
</a>
13+
</div>
14+
</header>
15+
</div>
16+
<div class="main-container container-sm">
17+
<h3 class="text-center">links for <a href="/abs/2023ApJ/abstract"><b>2023ApJ</b></a></h3>
18+
<div class="list-group">
19+
<div class="list-group-item">
20+
<a href="https://arxiv.org/abs/2310.03851" class="title">https://arxiv.org/abs/2310.03851</a>
21+
</div>
22+
<div class="list-group-item">
23+
<a href="https://arxiv.org/pdf/2310.03851" class="title">https://arxiv.org/pdf/2310.03851</a>
24+
</div>
25+
<div class="list-group-item">
26+
<a href="https://doi.org/10.3847/1538-4357/acffbd" class="title">https://doi.org/10.3847/1538-4357/acffbd</a>
27+
</div>
28+
<div class="list-group-item">
29+
<a href="https://example.com/document.pdf" class="title">https://example.com/document.pdf</a>
30+
</div>
31+
</div>
32+
</div>
33+
<div class="footer-container">
34+
<footer>
35+
<a href="http://www.si.edu" target="_blank">Smithsonian</a>
36+
<a href="https://www.cfa.harvard.edu/" target="_blank">CfA</a>
37+
<a href="http://www.nasa.gov" target="_blank">NASA</a>
38+
<a href="https://scixplorer.org/scixabout">About SciX</a>
39+
<a href="https://scixplorer.org/feedback/general">Give Feedback</a>
40+
<a href="https://twitter.com/scixcommunity">@scixcommunity</a>
41+
</footer>
42+
</div>
43+
</body>
1444
`;
1545

16-
const SKIP_URLS = [
17-
'http://www.cfa.harvard.edu/sao',
18-
'https://www.cfa.harvard.edu/',
19-
'http://www.si.edu',
20-
'http://www.nasa.gov',
21-
];
22-
2346
const expectedUrls = [
24-
{ type: 'arXiv', url: 'https://arxiv.org/pdf/1234.5678.pdf' },
25-
{ type: 'DOI', url: 'https://doi.org/10.1234/abcd' },
26-
{ type: 'HTML', url: 'https://example.com/page.html' },
47+
{ type: 'arXiv', url: 'https://arxiv.org/abs/2310.03851' },
48+
{ type: 'arXiv', url: 'https://arxiv.org/pdf/2310.03851' },
49+
{ type: 'DOI', url: 'https://doi.org/10.3847/1538-4357/acffbd' },
2750
{ type: 'PDF', url: 'https://example.com/document.pdf' },
2851
];
2952

@@ -38,12 +61,6 @@ describe('resourceLinks', () => {
3861
expect(transformUrl('https://example.com/script.js')).toBeNull();
3962
});
4063

41-
test('transformUrl filters known skipped domains', () => {
42-
for (const url of SKIP_URLS) {
43-
expect(transformUrl(url)).toBeNull();
44-
}
45-
});
46-
4764
test('transformUrl assigns correct type', () => {
4865
expect(transformUrl('https://arxiv.org/pdf/foo.pdf')).toEqual({
4966
type: 'arXiv',
@@ -81,6 +98,8 @@ describe('resourceLinks', () => {
8198
test('fetchUrl returns deduplicated transformed links', async () => {
8299
const mockFetch = global.fetch as unknown as ReturnType<typeof vi.fn>;
83100
mockFetch.mockResolvedValueOnce({
101+
ok: true,
102+
redirected: false,
84103
text: () => Promise.resolve(htmlWithLinks),
85104
});
86105

@@ -91,7 +110,9 @@ describe('resourceLinks', () => {
91110
test('fetchUrl returns empty list if input has no valid links', async () => {
92111
const mockFetch = global.fetch as unknown as ReturnType<typeof vi.fn>;
93112
mockFetch.mockResolvedValueOnce({
94-
text: () => Promise.resolve('<p>No links here</p>'),
113+
ok: true,
114+
redirected: false,
115+
text: () => Promise.resolve('<div class="list-group"></div>'),
95116
});
96117

97118
const result = await fetchUrl('fake-id');
@@ -104,14 +125,14 @@ describe('Redirected response', () => {
104125
vi.resetAllMocks();
105126
global.fetch = vi.fn();
106127
});
107-
test('fetchUrl handles 302 redirect and uses Location header', async () => {
128+
129+
test('fetchUrl detects browser-followed redirect via res.redirected', async () => {
108130
const mockFetch = global.fetch as unknown as ReturnType<typeof vi.fn>;
109131
mockFetch.mockResolvedValueOnce({
110-
status: 302,
111-
headers: {
112-
get: (name: string) => (name === 'Location' ? 'https://doi.org/10.1234/foo' : null),
113-
},
114-
text: () => Promise.resolve(''), // not used in redirect
132+
ok: true,
133+
redirected: true,
134+
url: 'https://doi.org/10.1234/foo',
135+
text: () => Promise.resolve(''),
115136
});
116137

117138
const result = await fetchUrl('test-id');
@@ -124,18 +145,51 @@ describe('Redirected response', () => {
124145
]);
125146
});
126147

127-
test('fetchUrl returns empty if redirect has no Location', async () => {
148+
test('fetchUrl returns empty if redirected URL is not valid', async () => {
128149
const mockFetch = global.fetch as unknown as ReturnType<typeof vi.fn>;
129150
mockFetch.mockResolvedValueOnce({
130-
status: 302,
131-
headers: {
132-
get: (() => null) as (name: string) => string | null,
133-
},
151+
ok: true,
152+
redirected: true,
153+
url: '',
134154
text: () => Promise.resolve(''),
135155
});
136156

137157
const result = await fetchUrl('test-id');
158+
expect(result).toEqual([]);
159+
});
160+
});
161+
162+
describe('Error responses', () => {
163+
beforeEach(() => {
164+
vi.resetAllMocks();
165+
global.fetch = vi.fn();
166+
});
167+
168+
test('fetchUrl returns empty list on 404', async () => {
169+
const mockFetch = global.fetch as unknown as ReturnType<typeof vi.fn>;
170+
mockFetch.mockResolvedValueOnce({
171+
ok: false,
172+
status: 404,
173+
text: () =>
174+
Promise.resolve(
175+
'<h3>The requested resource does not exist</h3>' +
176+
'<footer><a href="https://scixplorer.org/scixabout">About</a></footer>',
177+
),
178+
});
179+
180+
const result = await fetchUrl('bad-bibcode');
181+
expect(result).toEqual([]);
182+
});
183+
184+
test('fetchUrl returns empty list on 500', async () => {
185+
const mockFetch = global.fetch as unknown as ReturnType<typeof vi.fn>;
186+
mockFetch.mockResolvedValueOnce({
187+
ok: false,
188+
status: 500,
189+
text: () => Promise.resolve('Internal Server Error'),
190+
});
138191

192+
const result = await fetchUrl('error-bibcode');
139193
expect(result).toEqual([]);
140194
});
141195
});

src/lib/useGetResourceLinks.ts

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { isValidURL } from '@/utils/common/isValidURL';
33

44
export const resourceUrlTypes = ['arXiv', 'PDF', 'DOI', 'HTML', 'Other'] as const;
55

6-
export type ResourceUrlType = typeof resourceUrlTypes[number];
6+
export type ResourceUrlType = (typeof resourceUrlTypes)[number];
77

88
export interface IResourceUrl {
99
type: ResourceUrlType;
@@ -15,14 +15,6 @@ interface IUseResourceLinksProps {
1515
options?: UseQueryOptions<IResourceUrl[]>;
1616
}
1717

18-
// TODO: slightly brittle, since these links could change over time
19-
const SKIP_URLS = [
20-
'http://www.cfa.harvard.edu/sao',
21-
'https://www.cfa.harvard.edu/',
22-
'http://www.si.edu',
23-
'http://www.nasa.gov',
24-
];
25-
2618
const URL_TYPE_MAP: Record<string, ResourceUrlType> = {
2719
arxiv: 'arXiv',
2820
pdf: 'PDF',
@@ -31,14 +23,13 @@ const URL_TYPE_MAP: Record<string, ResourceUrlType> = {
3123
};
3224

3325
const RESOURCE_EXT_REGEX = /\.(jpg|jpeg|png|gif|webp|svg|css|js|ico|woff2?|ttf|otf|eot|map|mp4|webm)(\?|$)/i;
34-
const URL_REGX = /href="(https?:\/\/[^"]*)"/gi;
3526

3627
/**
3728
* Transforms a URL into a structured resource link object.
3829
* @param url
3930
*/
4031
export const transformUrl = (url: string) => {
41-
if (!url || typeof url !== 'string' || !isValidURL(url) || RESOURCE_EXT_REGEX.test(url) || SKIP_URLS.includes(url)) {
32+
if (!url || typeof url !== 'string' || !isValidURL(url) || RESOURCE_EXT_REGEX.test(url)) {
4233
return null;
4334
}
4435

@@ -56,29 +47,37 @@ export const fetchUrl = async (identifier: string): Promise<IResourceUrl[]> => {
5647
const url = `/link_gateway/${encodeURIComponent(identifier)}/ESOURCE`;
5748
const res = await fetch(url);
5849

59-
// check for 302 redirects
60-
if (res.status === 302 || res.status === 301) {
61-
const redirectUrl = res.headers.get('Location');
62-
if (redirectUrl) {
63-
const transformedUrl = transformUrl(redirectUrl);
64-
return transformedUrl ? [transformedUrl] : [];
65-
}
50+
if (!res.ok) {
6651
return [];
6752
}
6853

54+
// single-link resources redirect directly to the target URL
55+
if (res.redirected) {
56+
const transformedUrl = transformUrl(res.url);
57+
return transformedUrl ? [transformedUrl] : [];
58+
}
59+
6960
const raw = await res.text();
7061
if (!raw) {
7162
return [];
7263
}
7364

74-
const seen = new Set<string>();
75-
const result = Array.from(raw.matchAll(URL_REGX), ([, href]) => transformUrl(href));
65+
const parser = new DOMParser();
66+
const doc = parser.parseFromString(raw, 'text/html');
67+
const links = doc.querySelectorAll('.list-group-item a');
7668

69+
const seen = new Set<string>();
7770
const output: IResourceUrl[] = [];
78-
for (const res of result) {
79-
if (res && !seen.has(res.url)) {
80-
seen.add(res.url);
81-
output.push(res);
71+
72+
for (const link of links) {
73+
const href = link.getAttribute('href');
74+
if (!href) {
75+
continue;
76+
}
77+
const transformed = transformUrl(href);
78+
if (transformed && !seen.has(transformed.url)) {
79+
seen.add(transformed.url);
80+
output.push(transformed);
8281
}
8382
}
8483

0 commit comments

Comments
 (0)