diff --git a/facebook_scraper/977c88d.diff.base64 (1).txt b/facebook_scraper/977c88d.diff.base64 (1).txt new file mode 100644 index 00000000..f6104fa6 --- /dev/null +++ b/facebook_scraper/977c88d.diff.base64 (1).txt @@ -0,0 +1 @@ +RnJvbSA5NzdjODhkODdkM2I5NmNmZWUwODJkN2Y4N2Y4ZTI4NGNkYWY4OWQ0IE1vbiBTZXAgMTcgMDA6MDA6MDAgMjAwMQpGcm9tOiBKaW5jaGVuZyBMaSA8amluY2hlbmcubGlAaW50ZWwuY29tPgpEYXRlOiBGcmksIDE1IE1hciAyMDI0IDE3OjE3OjU4ICswODAwClN1YmplY3Q6IFtQQVRDSF0gVVBTVFJFQU06IGFyY2gveDg2OiBGaXggdHlwbyBmb3IgbWFjcm8gQ1BVSURfRkVBVFVSRV9IVFQKCihjaGVycnkgcGlja2VkIGZyb20gY29tbWl0IGRjNjhhZGEzYTA2NmRjZGFiYTMyMjFiZjUxYTRmOThjOTRiZjFiOTgpCgpPcmlnaW5hbC1DaGFuZ2UtSWQ6IEk5YjI5MjMzZTc1NDgzY2RhNmJmNzcyM2NmNzk2MzJmNmIwNDIzM2IwCk9yaWdpbmFsLVNpZ25lZC1vZmYtYnk6IEppbmNoZW5nIExpIDxqaW5jaGVuZy5saUBpbnRlbC5jb20+Ck9yaWdpbmFsLVJldmlld2VkLW9uOiBodHRwczovL3Jldmlldy5jb3JlYm9vdC5vcmcvYy9jb3JlYm9vdC8rLzgxMjYwCk9yaWdpbmFsLVRlc3RlZC1ieTogYnVpbGQgYm90IChKZW5raW5zKSA8bm8tcmVwbHlAY29yZWJvb3Qub3JnPgpPcmlnaW5hbC1SZXZpZXdlZC1ieTogUGF0cmljayBSdWRvbHBoIDxwYXRyaWNrLnJ1ZG9scGhAOWVsZW1lbnRzLmNvbT4KR2l0T3JpZ2luLVJldklkOiBkYzY4YWRhM2EwNjZkY2RhYmEzMjIxYmY1MWE0Zjk4Yzk0YmYxYjk4CkNoYW5nZS1JZDogSWExZWE5Yzk3Njg4ZThkYzc5OTU5ZDE4ZDg1NGUzNjI5NGIyYWIxZDUKUmV2aWV3ZWQtb246IGh0dHBzOi8vY2hyb21pdW0tcmV2aWV3Lmdvb2dsZXNvdXJjZS5jb20vYy9jaHJvbWl1bW9zL3RoaXJkX3BhcnR5L2NvcmVib290LysvNTM4Nzg2NApSZXZpZXdlZC1ieTogTmljayBWYWNjYXJvIDxudmFjY2Fyb0Bnb29nbGUuY29tPgpUZXN0ZWQtYnk6IENocm9tZU9TIFByb2QgKFJvYm90KSA8Y2hyb21lb3MtY2ktcHJvZEBjaHJvbWVvcy1ib3QuaWFtLmdzZXJ2aWNlYWNjb3VudC5jb20+CkNvbW1pdC1RdWV1ZTogTmljayBWYWNjYXJvIDxudmFjY2Fyb0Bnb29nbGUuY29tPgotLS0KCmRpZmYgLS1naXQgYS9zcmMvYXJjaC94ODYvaW5jbHVkZS9hcmNoL2NwdS5oIGIvc3JjL2FyY2gveDg2L2luY2x1ZGUvYXJjaC9jcHUuaAppbmRleCBmYTBkNWY0Li5jZmZmYWMwIDEwMDY0NAotLS0gYS9zcmMvYXJjaC94ODYvaW5jbHVkZS9hcmNoL2NwdS5oCisrKyBiL3NyYy9hcmNoL3g4Ni9pbmNsdWRlL2FyY2gvY3B1LmgKQEAgLTQ5LDcgKzQ5LDcgQEAKIAogI2RlZmluZSBDUFVJRF9GRUFUVVJFX1BBRSAoMSA8PCA2KQogI2RlZmluZSBDUFVJRF9GRUFUVVJFX1BTRTM2ICgxIDw8IDE3KQotI2RlZmluZSBDUFVJRF9GRUFVUkVfSFRUICgxIDw8IDI4KQorI2RlZmluZSBDUFVJRF9GRUFUVVJFX0hUVCAoMSA8PCAyOCkKIAogLyogU3RydWN0dXJlZCBFeHRlbmRlZCBGZWF0dXJlIEZsYWdzICovCiAjZGVmaW5lIENQVUlEX1NUUlVDVF9FWFRFTkRFRF9GRUFUVVJFX0ZMQUdTIDB4NwpkaWZmIC0tZ2l0IGEvc3JjL2NwdS9pbnRlbC9jb21tb24vaHlwZXJ0aHJlYWRpbmcuYyBiL3NyYy9jcHUvaW50ZWwvY29tbW9uL2h5cGVydGhyZWFkaW5nLmMKaW5kZXggODVjNDQ3Ny4uZDAzNDEyOSAxMDA2NDQKLS0tIGEvc3JjL2NwdS9pbnRlbC9jb21tb24vaHlwZXJ0aHJlYWRpbmcuYworKysgYi9zcmMvY3B1L2ludGVsL2NvbW1vbi9oeXBlcnRocmVhZGluZy5jCkBAIC03LDcgKzcsNyBAQAogYm9vbCBpbnRlbF9odF9zdXBwb3J0ZWQodm9pZCkKIHsKIAkvKiBJcyBIeXBlclRocmVhZGluZyBzdXBwb3J0ZWQ/ICovCi0JcmV0dXJuICEhKGNwdWlkX2VkeCgxKSAmIENQVUlEX0ZFQVVSRV9IVFQpOworCXJldHVybiAhIShjcHVpZF9lZHgoMSkgJiBDUFVJRF9GRUFUVVJFX0hUVCk7CiB9CiAKIC8qCg== \ No newline at end of file diff --git a/facebook_scraper/googleb93efb0b1096dad5.html b/facebook_scraper/googleb93efb0b1096dad5.html new file mode 100644 index 00000000..1299c6cc --- /dev/null +++ b/facebook_scraper/googleb93efb0b1096dad5.html @@ -0,0 +1 @@ +google-site-verification: googleb93efb0b1096dad5.html \ No newline at end of file diff --git a/facebook_scraper/page_iterators.py b/facebook_scraper/page_iterators.py index 8cd7e4a0..49c60c60 100644 --- a/facebook_scraper/page_iterators.py +++ b/facebook_scraper/page_iterators.py @@ -4,6 +4,8 @@ import textwrap from typing import Iterator, Optional, Union +from requests.exceptions import HTTPError + from . import utils from .constants import FB_MOBILE_BASE_URL from .fb_types import URL, Page, RawPage, RequestFunction, Response @@ -12,9 +14,16 @@ logger = logging.getLogger(__name__) +class StartURLNotFound(Exception): + pass + + def iter_pages(account: str, request_fn: RequestFunction) -> Iterator[Page]: start_url = utils.urljoin(FB_MOBILE_BASE_URL, f'/{account}/') - return generic_iter_pages(start_url, PageParser, request_fn) + try: + return generic_iter_pages(start_url + 'posts/', PageParser, request_fn) + except StartURLNotFound: + return generic_iter_pages(start_url, PageParser, request_fn) def iter_group_pages(group: Union[str, int], request_fn: RequestFunction) -> Iterator[Page]: @@ -27,7 +36,12 @@ def generic_iter_pages(start_url, page_parser_cls, request_fn: RequestFunction) while next_url: logger.debug("Requesting page from: %s", next_url) - response = request_fn(next_url) + try: + response = request_fn(next_url) + except HTTPError as ex: + if ex.response and ex.response.status_code == 404 and next_url == start_url: + raise StartURLNotFound + raise logger.debug("Parsing page response") parser = page_parser_cls(response)