Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .eslintrc.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
extends: "eslint:recommended"
parserOptions:
ecmaVersion: 8
ecmaVersion: 2020
sourceType: "module"
env:
node: true
Expand Down
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
.idea
.DS_Store
node_modules
package-lock.json
npm-debug.log
coverage
test/e2e/results
.nyc-output
.nyc-output
112 changes: 94 additions & 18 deletions lib/request.js
Original file line number Diff line number Diff line change
@@ -1,31 +1,94 @@
import got from 'got';
import logger from './logger.js';
import { extend } from './utils/index.js';

/**
 * Extract the bare MIME type from a Content-Type header value.
 *
 * @param {string} [contentType] - raw header value, e.g. "text/html; charset=utf-8"
 * @returns {string|null} the part before the first ";", or null when the value is falsy
 */
function getMimeType (contentType) {
	if (!contentType) {
		return null;
	}

	const [mimeType] = contentType.split(';');
	return mimeType;
}
import * as cheerio from 'cheerio';

/**
 * Default afterResponse handler: passes the response through unchanged.
 *
 * @param {object} context - handler arguments
 * @param {*} context.response - the response to hand back
 * @returns {Promise<*>} promise resolved with the given response
 */
function defaultResponseHandler (context) {
	const { response } = context;

	return Promise.resolve(response);
}

function extractEncodingFromHeader (headers) {
const contentTypeHeader = headers['content-type'];
function extractEncodingFromResponse (response) {
if (typeof response?.headers === 'object') {
const contentTypeHeader = response.headers['content-type'];

return contentTypeHeader && contentTypeHeader.includes('utf-8') ? 'utf8' : 'binary';
}

return undefined;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we return undefined from this function, but null from extractMimeTypeFromResponse? Maybe it would be better to return the same result (null or undefined) from both functions, because they seem to be very similar.

}

/**
 * Extract the bare MIME type from a response's Content-Type header.
 *
 * @param {object} [response] - response-like object; may be null/undefined
 * @param {object} [response.headers] - header map keyed by lower-case header name
 * @returns {string|null} the part of the header before the first ";" with
 *   surrounding whitespace removed, or null when there is no string
 *   content-type header
 */
function extractMimeTypeFromResponse (response) {
	// Optional chaining instead of `typeof headers === 'object'`:
	// typeof null is 'object', so a null headers value used to throw here.
	const contentTypeHeader = response?.headers?.['content-type'];

	if (typeof contentTypeHeader !== 'string') {
		return null;
	}

	// Trim so that "text/html ; charset=utf-8" still yields "text/html".
	return contentTypeHeader.split(';')[0].trim();
}

/**
 * Detect the encoding of an HTML response from a `<meta charset>` tag in its body.
 *
 * @param {object} response - response-like object passed to getBodyAsString
 * @returns {string} 'utf-8' when the body has a `<meta charset>` attribute equal
 *   to "utf-8" (case-insensitive); 'binary' in every other case, including
 *   when reading or parsing the body fails
 */
function extractEncodingFromHtmlResponse (response) {
	try {
		const body = getBodyAsString(response);

		if (body) {
			const $ = cheerio.load(body);
			const charset = $('meta[charset]').attr('charset');

			if (charset && charset.toLowerCase() === 'utf-8') {
				return 'utf-8';
			}
		}
	} catch (err) {
		// Best effort: a failure here only means we fall back to 'binary'.
		// `?.` keeps the handler itself from throwing when response is nullish.
		logger.error('Error parsing response html', response?.url, err);
	}

	return 'binary';
}

/**
 * Detect the encoding of a CSS response from an `@charset "UTF-8"` rule in its body.
 *
 * @param {object} response - response-like object passed to getBodyAsString
 * @returns {string} 'utf-8' when the body contains an `@charset "utf-8"` rule
 *   (matched case-insensitively); 'binary' in every other case, including
 *   when reading the body fails
 */
function extractEncodingFromCssResponse (response) {
	try {
		const body = getBodyAsString(response);

		// Encoding labels are case-insensitive, so accept "utf-8" as well as "UTF-8".
		if (body && /@charset "utf-8"/i.test(body)) {
			return 'utf-8';
		}
	} catch (err) {
		// Best effort: a failure here only means we fall back to 'binary'.
		// `?.` keeps the handler itself from throwing when response is nullish.
		logger.error('Error parsing response css', response?.url, err);
	}

	return 'binary';
}

/**
 * Pick a content-based encoding detector according to the response MIME type.
 *
 * @param {object} response - response-like object
 * @returns {string|null} the result of the HTML or CSS detector for
 *   'text/html' / 'text/css' responses; null for any other MIME type
 */
function getEncodingByMime (response) {
	const mimeType = extractMimeTypeFromResponse(response);

	if (mimeType === 'text/html') {
		return extractEncodingFromHtmlResponse(response);
	}

	if (mimeType === 'text/css') {
		return extractEncodingFromCssResponse(response);
	}

	return null;
}

/**
 * Work out which encoding should be used to stringify a response body.
 *
 * Order of precedence: an explicit `response.encoding`, then the value
 * derived by extractEncodingFromResponse, then 'binary'. When that yields
 * 'binary', getEncodingByMime gets a chance to override it.
 *
 * @param {*} response - response-like object; non-objects yield 'binary'
 * @returns {string} the selected encoding, 'binary' when nothing better is found
 */
function getEncoding (response) {
	if (!response || typeof response !== 'object') {
		return 'binary';
	}

	const encoding = response.encoding || extractEncodingFromResponse(response) || 'binary';

	if (encoding !== 'binary') {
		return encoding;
	}

	// 'binary' doubles as "nothing more specific found", so fall back to
	// MIME-specific detection before settling on it.
	return getEncodingByMime(response) || encoding;
}

function throwTypeError (result) {
Expand All @@ -42,15 +105,15 @@ function throwTypeError (result) {

/**
 * Unwrap the payload of a response-handler result.
 *
 * @param {*} result - either the payload itself or an object carrying it in `body`
 * @returns {*} `result.body` when it is defined (including null), otherwise
 *   `result` itself
 */
function getData (result) {
	// `?.` makes null/undefined results fall through to "return result".
	if (result?.body !== undefined) {
		return result.body;
	}

	return result;
}

function transformResult (result) {
const encoding = getEncoding(result);
function getBodyAsString (result, encoding) {
const data = getData(result);

// Check for no data
Expand All @@ -61,17 +124,29 @@ function transformResult (result) {
// Then stringify it.
let body = null;
if (data instanceof Buffer) {
body = data.toString(encoding);
body = data.toString(encoding || 'binary');
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need || here? It looks like encoding will always be 'binary' even if it was not found in the headers or the file content.

} else if (typeof data === 'string') {
body = data;
} else {
throwTypeError(result);
}

return body;
}

/**
 * Normalise a response-handler result into `{ body, encoding, metadata }`.
 *
 * @param {*} result - value returned by the response handler
 * @returns {{body: string, encoding: string, metadata: *}|null} the normalised
 *   result, or null when either the extracted data or the stringified body is null
 */
function transformResult (result) {
	const encoding = getEncoding(result);
	const data = getData(result);
	const body = getBodyAsString(result, encoding);

	if (data === null || body === null) {
		return null;
	}

	return {
		body,
		encoding,
		// Metadata may sit on the result or on its body; `?.` guards against
		// either of them being nullish.
		metadata: result?.metadata || data?.metadata || null
	};
}

Expand All @@ -92,9 +167,10 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR
if (!responseHandlerResult) {
return null;
}

return {
url: response.url,
mimeType: getMimeType(response.headers['content-type']),
mimeType: extractMimeTypeFromResponse(response),
body: responseHandlerResult.body,
metadata: responseHandlerResult.metadata,
encoding: responseHandlerResult.encoding
Expand Down
Loading