Skip to content

Commit 2d69418

Browse files
authored
Merge pull request #736 from guardian/pf/ingest-PA_API-as-PA-supplier
Map `source-feed: PA_API` to `supplier: PA` on ingestion
2 parents f5c81b0 + 92846ee commit 2d69418

File tree

12 files changed

+239
-161
lines changed

12 files changed

+239
-161
lines changed

ingestion-lambda/partition.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
/**
 * Splits an array into two new arrays according to a predicate.
 *
 * The input is only read, never mutated, so both mutable and readonly
 * arrays (including frozen arrays and `as const` tuples) are accepted.
 * Relative order of the items is preserved within each returned array.
 *
 * @param arr - The items to split.
 * @param predicate - Called once per item, in order; a `true` result
 *   routes the item to the first returned array, `false` to the second.
 * @returns A tuple `[isTrueOf, isFalseOf]`: the items for which the
 *   predicate returned `true`, followed by those for which it returned `false`.
 */
export function partition<T>(
  arr: readonly T[],
  predicate: (item: T) => boolean,
): [T[], T[]] {
  const isTrueOf: T[] = [];
  const isFalseOf: T[] = [];
  arr.forEach((item) => {
    if (predicate(item)) {
      isTrueOf.push(item);
    } else {
      isFalseOf.push(item);
    }
  });
  return [isTrueOf, isFalseOf];
}

ingestion-lambda/src/categoryCodes.test.ts

Lines changed: 77 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import {
44
processFingerpostAAPCategoryCodes,
55
processFingerpostAFPCategoryCodes,
66
processFingerpostAPCategoryCodes,
7-
processFingerpostPAAPICategoryCodes,
87
processFingerpostPACategoryCodes,
98
processReutersDestinationCodes,
109
processUnknownFingerpostCategoryCodes,
@@ -193,77 +192,92 @@ describe('processFingerpostAFPCategoryCodes', () => {
193192

194193
describe('processFingerpostPACategoryCodes', () => {
195194
describe('processFingerpostPACategoryCodes', () => {
196-
it('should return an empty array if provided with an empty array', () => {
197-
expect(processFingerpostPACategoryCodes([])).toEqual([]);
198-
});
199-
200-
it('should strip out service codes', () => {
201-
expect(processFingerpostPACategoryCodes(['service:news'])).toEqual([]);
202-
});
203-
204-
it('should strip out empty iptccat entries', () => {
205-
expect(
206-
processFingerpostPACategoryCodes(['iptccat:', 'iptccat:a']),
207-
).toEqual(['paCat:a']);
195+
describe('when mediaCatCodes is not provided', () => {
196+
it('should return an empty array if provided with an empty array', () => {
197+
expect(processFingerpostPACategoryCodes([])).toEqual([]);
198+
});
199+
200+
it('should strip out service codes', () => {
201+
expect(processFingerpostPACategoryCodes(['service:news'])).toEqual([]);
202+
});
203+
204+
it('should strip out empty iptccat entries', () => {
205+
expect(
206+
processFingerpostPACategoryCodes(['iptccat:', 'iptccat:a']),
207+
).toEqual(['paCat:a']);
208+
});
209+
210+
it('should return simple codes labelled "iptccat" as simple "paCat" codes', () => {
211+
expect(
212+
processFingerpostPACategoryCodes(['iptccat:a', 'iptccat:b']),
213+
).toEqual(['paCat:a', 'paCat:b']);
214+
});
215+
216+
it('should expand category codes with multiple subcodes', () => {
217+
expect(processFingerpostPACategoryCodes(['iptccat:c+d'])).toEqual([
218+
'paCat:c',
219+
'paCat:d',
220+
]);
221+
});
222+
223+
it('should remove empty strings', () => {
224+
expect(
225+
processFingerpostPACategoryCodes(['iptccat:a', '', 'iptccat:c']),
226+
).toEqual(['paCat:a', 'paCat:c']);
227+
});
228+
229+
it('should remove trailing and leading whitespace', () => {
230+
expect(
231+
processFingerpostPACategoryCodes([
232+
'iptccat:a ',
233+
' iptccat:c',
234+
' service:news ',
235+
'qCode:value ',
236+
'iptccat: ',
237+
]),
238+
).toEqual(['paCat:a', 'paCat:c', 'qCode:value']);
239+
});
240+
241+
it('should deduplicate category codes after stripping whitespace', () => {
242+
expect(
243+
processFingerpostPACategoryCodes([
244+
'iptccat:a ',
245+
' iptccat:a',
246+
'iptccat:c',
247+
]),
248+
).toEqual(['paCat:a', 'paCat:c']);
249+
});
250+
251+
it('should return original codes unchanged if they are not prefixed with "iptccat" or "service"', () => {
252+
expect(
253+
processFingerpostPACategoryCodes([
254+
'sport',
255+
'sport:uk',
256+
'paCat:sport:uk',
257+
]),
258+
).toEqual(['sport', 'sport:uk', 'paCat:sport:uk']);
259+
});
208260
});
261+
});
209262

210-
it('should return simple codes labelled "iptccat" as simple "paCat" codes', () => {
263+
describe('when mediaCatCodes is provided', () => {
264+
it('should append the mediaCatCode to the processed category codes', () => {
211265
expect(
212-
processFingerpostPACategoryCodes(['iptccat:a', 'iptccat:b']),
213-
).toEqual(['paCat:a', 'paCat:b']);
266+
processFingerpostPACategoryCodes(['iptccat:a'], 'exampleCategory'),
267+
).toEqual(['paCat:a', 'paCat:exampleCategory']);
214268
});
215269

216-
it('should expand category codes with multiple subcodes', () => {
217-
expect(processFingerpostPACategoryCodes(['iptccat:c+d'])).toEqual([
218-
'paCat:c',
219-
'paCat:d',
270+
it('should return only the mediaCatCode if no other category codes are provided', () => {
271+
expect(processFingerpostPACategoryCodes([], 'exampleCategory')).toEqual([
272+
'paCat:exampleCategory',
220273
]);
221274
});
222275

223-
it('should remove empty strings', () => {
224-
expect(
225-
processFingerpostPACategoryCodes(['iptccat:a', '', 'iptccat:c']),
226-
).toEqual(['paCat:a', 'paCat:c']);
276+
it('should return an empty array if no category codes or mediaCatCode are provided', () => {
277+
expect(processFingerpostPACategoryCodes([], undefined)).toEqual([]);
278+
expect(processFingerpostPACategoryCodes([], '')).toEqual([]);
279+
expect(processFingerpostPACategoryCodes([], ' ')).toEqual([]);
227280
});
228-
229-
it('should remove trailing and leading whitespace', () => {
230-
expect(
231-
processFingerpostPACategoryCodes([
232-
'iptccat:a ',
233-
' iptccat:c',
234-
' service:news ',
235-
'qCode:value ',
236-
'iptccat: ',
237-
]),
238-
).toEqual(['paCat:a', 'paCat:c', 'qCode:value']);
239-
});
240-
241-
it('should deduplicate category codes after stripping whitespace', () => {
242-
expect(
243-
processFingerpostPACategoryCodes([
244-
'iptccat:a ',
245-
' iptccat:a',
246-
'iptccat:c',
247-
]),
248-
).toEqual(['paCat:a', 'paCat:c']);
249-
});
250-
});
251-
});
252-
253-
describe('processFingerpostPAAPICategoryCodes', () => {
254-
it('should return an empty array if provided with an empty array', () => {
255-
expect(processFingerpostPAAPICategoryCodes([])).toEqual([]);
256-
});
257-
it('should append paCat if mediaCatCode is defined', () => {
258-
expect(processFingerpostPAAPICategoryCodes([], 'exampleCategory')).toEqual([
259-
'paCat:exampleCategory',
260-
]);
261-
});
262-
it('should return the array unchanged if original category codes are supplied', () => {
263-
expect(processFingerpostPAAPICategoryCodes(['hello', 'world'])).toEqual([
264-
'hello',
265-
'world',
266-
]);
267281
});
268282
});
269283

ingestion-lambda/src/categoryCodes.ts

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import nlp from 'compromise';
2+
import { partition } from '../partition';
23
import { worldTopicCodes } from '../topicCodes';
34
import {
45
alpha2CountriesMap,
@@ -140,28 +141,39 @@ export function processFingerpostAFPCategoryCodes(
140141
return deduped;
141142
}
142143

143-
export function processFingerpostPACategoryCodes(original: string[]) {
144+
export function processFingerpostPACategoryCodes(
145+
original: string[],
146+
maybeMediaCatCode?: string,
147+
): string[] {
144148
const notServiceCodes = original.filter((_) => !_.includes('service:'));
145-
146-
const transformedCategoryCodes = notServiceCodes
149+
const [iptcCatCodes, otherCodes] = partition(notServiceCodes, (_) =>
150+
_.includes('iptccat:'),
151+
);
152+
const transformedCategoryCodes = iptcCatCodes
147153
.flatMap((_) => unpackCategoryCodes(_, 'paCat'))
148154
.map((_) => replacePrefixesFromLookup(_, { iptccat: 'paCat' }))
149155
.map(categoryCodeToString);
150156

151-
const deduped = [...new Set(transformedCategoryCodes)];
157+
const maybeMediaCatCodes =
158+
maybeMediaCatCode !== undefined && maybeMediaCatCode.trim().length > 0
159+
? [`paCat:${maybeMediaCatCode.trim()}`]
160+
: [];
161+
162+
const trimmedOtherCodes = otherCodes
163+
.map((code) => code.trim())
164+
.filter((code) => code.length > 0);
165+
166+
const deduped = [
167+
...new Set([
168+
...transformedCategoryCodes,
169+
...maybeMediaCatCodes,
170+
...trimmedOtherCodes,
171+
]),
172+
];
152173

153174
return deduped;
154175
}
155176

156-
export function processFingerpostPAAPICategoryCodes(
157-
original: string[],
158-
mediaCatCodes?: string,
159-
) {
160-
if (mediaCatCodes) {
161-
return [...original, `paCat:${mediaCatCodes}`];
162-
}
163-
return [...original];
164-
}
165177
export function processUnknownFingerpostCategoryCodes(
166178
original: string[],
167179
supplier: string,

ingestion-lambda/src/processContentObject.test.ts

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import type { DataFormatInfo } from 'newswires-shared/index';
2+
import paApiDataFormattingFixture from '../test/fixtures/PA_API-Data-Formatting.json';
23
import paApiFixture from '../test/fixtures/PA_API.json';
34
import { cleanBodyTextMarkup } from './cleanMarkup';
45
import {
@@ -175,18 +176,38 @@ describe('processFingerpostJsonContent', () => {
175176
imageIds: [],
176177
keywords: [],
177178
},
178-
supplier: 'PAAPI',
179+
supplier: 'PA',
179180
guSourceFeed: 'PA_API',
180181
status: 'success',
181182
categoryCodes: [
183+
'paCat:SCN',
182184
'news',
183185
'news:uk',
184186
'politics',
185187
'news:scotland',
186-
'paCat:SCN',
187188
],
188189
});
189190
});
191+
192+
it('should process a PA API fixture with Data Formatting correctly', () => {
193+
const paApiWithDataFormatting = processFingerpostJsonContent(
194+
JSON.stringify(paApiDataFormattingFixture),
195+
);
196+
expect(paApiWithDataFormatting).toEqual({
197+
content: {
198+
...paApiDataFormattingFixture,
199+
body_text: cleanBodyTextMarkup(
200+
safeBodyParse(JSON.stringify(paApiDataFormattingFixture)).body_text!,
201+
),
202+
imageIds: [],
203+
keywords: [],
204+
},
205+
supplier: 'PA',
206+
guSourceFeed: 'PA_API DATA FORMATTING',
207+
status: 'success',
208+
categoryCodes: ['paCat:SFU', 'paCat:SCN'],
209+
});
210+
});
190211
});
191212

192213
describe('remapSourceFeeds', () => {

ingestion-lambda/src/processContentObject.ts

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ import {
1313
processFingerpostAAPCategoryCodes,
1414
processFingerpostAFPCategoryCodes,
1515
processFingerpostAPCategoryCodes,
16-
processFingerpostPAAPICategoryCodes,
1716
processFingerpostPACategoryCodes,
1817
processReutersDestinationCodes,
1918
processReutersTopicCodes,
@@ -69,14 +68,14 @@ export function processCategoryCodes({
6968
destinationCodes,
7069
bodyText,
7170
priority,
72-
mediaCatCodes,
71+
maybeMediaCatCode,
7372
}: {
7473
supplier: string;
7574
subjectCodes: string[];
7675
destinationCodes: string[];
7776
bodyText?: string;
7877
priority?: string;
79-
mediaCatCodes?: string;
78+
maybeMediaCatCode?: string;
8079
}) {
8180
const catCodes: string[] = priority === '1' ? ['HIGH_PRIORITY'] : [];
8281
const regionCodes = inferGeographicalCategoriesFromText(bodyText);
@@ -113,11 +112,9 @@ export function processCategoryCodes({
113112
...regionCodes,
114113
];
115114
case 'PA':
116-
return [...catCodes, ...processFingerpostPACategoryCodes(subjectCodes)];
117-
case 'PAAPI':
118115
return [
119116
...catCodes,
120-
...processFingerpostPAAPICategoryCodes(subjectCodes, mediaCatCodes),
117+
...processFingerpostPACategoryCodes(subjectCodes, maybeMediaCatCode),
121118
];
122119
case 'MINOR_AGENCIES': {
123120
const updatedSubjectCodes = [
@@ -254,25 +251,27 @@ export function processFingerpostJsonContent(
254251

255252
const supplier = lookupSupplier(content['source-feed']) ?? 'Unknown';
256253

254+
const guSourceFeed = remapSourceFeeds({
255+
sourceFeed: content['source-feed'],
256+
dataFormat: content.dataformat,
257+
subjectCodes: content.subjects?.code,
258+
});
259+
257260
const categoryCodes = dedupeStrings(
258261
processCategoryCodes({
259262
supplier,
260263
subjectCodes: content.subjects?.code ?? [],
261264
destinationCodes: content.destinations?.code ?? [],
262265
bodyText: `${content.headline ?? ''} ${content.abstract ?? ''} ${content.body_text}`,
263266
priority: content.priority,
264-
mediaCatCodes: content.mediaCatCodes,
267+
maybeMediaCatCode: content.mediaCatCodes, // the name in the original JSON is plural, but the value is always a single string, rather than an array, if it exists
265268
}),
266269
);
267270
return {
268271
status: 'success' as const,
269272
content,
270273
supplier,
271-
guSourceFeed: remapSourceFeeds({
272-
sourceFeed: content['source-feed'],
273-
dataFormat: content.dataformat,
274-
subjectCodes: content.subjects?.code,
275-
}),
274+
guSourceFeed,
276275
categoryCodes,
277276
};
278277
} catch (error) {

0 commit comments

Comments
 (0)