Commit b00f1eb

Release v1.0.3: Enhanced Search Pagination

1 parent 527876b commit b00f1eb

File tree

12 files changed (+268, -64 lines)

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
@@ -2,6 +2,21 @@
 
 All notable changes to this project will be documented in this file.
 
+## [1.0.3] - 2025-10-15
+
+### New Features
+
+- **Enhanced Search Pagination**: Fetch unlimited search results (300+) with automatic pagination; no longer capped at 50 results
+- **Improved Search Performance**: Optimized search-result fetching with better token handling and batch processing
+
+### Bug Fixes & Code Quality Improvements
+
+- **Code Review**: Addressed security vulnerabilities and code-quality issues
+- **Error Handling**: Improved error-handling patterns across all modules
+- **Performance**: Optimized JSON parsing and HTTP client fallback logic
+- **Security**: Fixed potential SSRF and injection vulnerabilities
+- **Maintainability**: Enhanced code readability and documentation
+
 ## [1.0.2] - 2025-01-15
 
 ### Major Release - Complete Library Redesign 🚀
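As a quick illustration of the headline change, here is a minimal usage sketch. It assumes the package exposes a client class carrying the `search_analyze` method whose signature appears in `gplay_scraper/core/gplay_methods.py` below; the entry-point name `GPlayScraper` is an assumption, not something this diff confirms.

```python
# Hypothetical usage sketch for the v1.0.3 pagination feature.
# `GPlayScraper` is an assumed entry-point name; only search_analyze's
# signature is visible in this commit.
from gplay_scraper import GPlayScraper

client = GPlayScraper()

# Before v1.0.3 results were capped around 50; the scraper now keeps
# fetching batches of up to 100 until `count` results are collected.
results = client.search_analyze("fitness tracker", count=300, lang="en", country="us")

print(len(results))       # up to 300 results
print(results[0]["url"])  # fields come from SearchParser.format_search_result
```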

README.md

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@
 - Localized pricing and availability
 - Competitive analysis and benchmarking
 
-## 🆕 **What's New in v1.0.2**
+## 🆕 **What's New in v1.0.3**
 
 **✅ 7 Method Types:**
 - **App Methods** - Extract 65+ data fields from any app (ratings, installs, pricing, permissions, etc.)

docs/changelog.rst

Lines changed: 17 additions & 1 deletion
@@ -3,7 +3,23 @@ Changelog
 
 All notable changes to this project will be documented in this file.
 
-Version 1.0.2 (2025-01-15)
+Version 1.0.3 (2025-10-15)
+---------------------------
+
+**New Features**
+
+- **Enhanced Search Pagination**: Fetch unlimited search results (300+) with automatic pagination; no longer capped at 50 results
+- **Improved Search Performance**: Optimized search-result fetching with better token handling and batch processing
+
+**Bug Fixes & Code Quality Improvements**
+
+- **Code Review**: Addressed security vulnerabilities and code-quality issues
+- **Error Handling**: Improved error-handling patterns across all modules
+- **Performance**: Optimized JSON parsing and HTTP client fallback logic
+- **Security**: Fixed potential SSRF and injection vulnerabilities
+- **Maintainability**: Enhanced code readability and documentation
+
+Version 1.0.2 (2025-01-15)
 ---------------------------
 
 **Major Release - Complete Library Redesign** 🚀

docs/conf.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 project = 'GPlay Scraper'
 copyright = '2025, Mohammed Cha'
 author = 'Mohammed Cha'
-release = '1.0.2'
+release = '1.0.3'
 
 # Extensions
 extensions = [

gplay_scraper/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@
 logging.getLogger(__name__).addHandler(logging.NullHandler())
 
 # Package metadata
-__version__ = "1.0.2"
+__version__ = "1.0.3"
 
 # Public API exports
 __all__ = [

gplay_scraper/config.py

Lines changed: 2 additions & 1 deletion
@@ -78,7 +78,8 @@ class Config:
         "SUGGEST_NOT_FOUND": "Suggestions not found for: {term}",
         "NO_DS3_DATA": "No data found in dataset",
         "DS3_NOT_FOUND": "Could not find data",
-        "DS3_JSON_PARSE_FAILED": "Failed to parse JSON: {error}"
+        "DS3_JSON_PARSE_FAILED": "Failed to parse JSON: {error}",
+        "SEARCH_PAGINATION_FAILED": "Failed to fetch paginated search results: {error}"
     }
 
     @classmethod
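Like its neighbors (`{term}`, `{error}`), the new entry is a `str.format` template. A one-line sketch of how such a message would typically be rendered; the actual call site is not part of this commit:

```python
# Hypothetical rendering of the new error template; the real call site
# is not shown in this diff.
from gplay_scraper.config import Config

msg = Config.ERROR_MESSAGES["SEARCH_PAGINATION_FAILED"].format(error="HTTP 429")
print(msg)  # Failed to fetch paginated search results: HTTP 429
```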

gplay_scraper/core/gplay_methods.py

Lines changed: 5 additions & 5 deletions
@@ -142,7 +142,7 @@ def __init__(self, http_client: str = None):
         self.parser = SearchParser()
 
     def search_analyze(self, query: str, count: int = Config.DEFAULT_SEARCH_COUNT, lang: str = Config.DEFAULT_LANGUAGE, country: str = Config.DEFAULT_COUNTRY) -> List[Dict]:
-        """Search for apps and get complete results.
+        """Search for apps and get complete results with pagination support.
 
         Args:
             query: Search query string
@@ -158,11 +158,11 @@ def search_analyze(self, query: str, count: int = Config.DEFAULT_SEARCH_COUNT, l
         """
         if not query or not isinstance(query, str):
             raise InvalidAppIdError(Config.ERROR_MESSAGES["INVALID_QUERY"])
-
-        html_content = self.scraper.fetch_playstore_search(query, count, lang, country)
-        dataset = self.scraper.scrape_play_store_data(html_content)
-        raw_results = self.parser.parse_search_results(dataset, count)
 
+        # scrape_play_store_data now handles pagination automatically
+        dataset = self.scraper.scrape_play_store_data(query, count, lang, country)
+
+        raw_results = self.parser.parse_search_results(dataset, count)
         return [self.parser.format_search_result(result) for result in raw_results]
 
     def search_get_field(self, query: str, field: str, count: int = Config.DEFAULT_SEARCH_COUNT, lang: str = Config.DEFAULT_LANGUAGE, country: str = Config.DEFAULT_COUNTRY) -> List[Any]:

gplay_scraper/core/gplay_parser.py

Lines changed: 62 additions & 5 deletions
@@ -8,7 +8,7 @@
 import re
 from datetime import datetime, timezone
 from typing import Dict, Any, List, Optional, Tuple
-from ..models.element_specs import ElementSpecs
+from ..models.element_specs import ElementSpecs, nested_lookup
 from ..utils.helpers import clean_json_string, alternative_json_clean, calculate_app_age, calculate_daily_installs, calculate_monthly_installs
 from ..config import Config
 from ..exceptions import DataParsingError
@@ -155,7 +155,6 @@ def parse_search_results(self, dataset: Dict, count: int) -> List[Dict]:
         if "ds:1" not in dataset:
             return []
 
-        from ..models.element_specs import nested_lookup
         search_data = nested_lookup(dataset.get("ds:1", {}), [0, 1, 0, 0, 0])
 
         if not search_data:
@@ -209,6 +208,67 @@ def format_search_result(self, result: dict) -> dict:
             "free": result.get("free"),
             "url": result.get("url"),
         }
+
+    def extract_pagination_token(self, dataset: Dict) -> Optional[str]:
+        """Extract the pagination token from a search dataset.
+
+        Args:
+            dataset: Search dataset
+
+        Returns:
+            Pagination token, or None if none is present
+        """
+        sections = nested_lookup(dataset.get("ds:1", {}), [0, 1, 0, 0])
+
+        if not sections:
+            return None
+
+        for section in sections:
+            if isinstance(section, list) and len(section) > 1:
+                potential_token = nested_lookup(section, [1])
+                if isinstance(potential_token, str):
+                    return potential_token
+        return None
+
+    def parse_html_content(self, html_content: str) -> Dict:
+        """Extract datasets from search page HTML.
+
+        Args:
+            html_content: HTML content of search page
+
+        Returns:
+            Dictionary containing all datasets
+
+        Raises:
+            DataParsingError: If no datasets found
+        """
+        import json
+
+        script_regex = re.compile(r"AF_initDataCallback[\s\S]*?</script")
+        key_regex = re.compile(r"(ds:.*?)'")
+        value_regex = re.compile(r"data:([\s\S]*?), sideChannel: \{\}\}\);</")
+
+        matches = script_regex.findall(html_content)
+        dataset = {}
+
+        for match in matches:
+            key_match = key_regex.findall(match)
+            value_match = value_regex.findall(match)
+
+            if key_match and value_match:
+                key = key_match[0]
+                try:
+                    value = json.loads(value_match[0])
+                    dataset[key] = value
+                except json.JSONDecodeError:
+                    continue
+
+        if not dataset:
+            raise DataParsingError("No search data found in HTML")
+
+        return dataset
 
 
 class ReviewsParser:
@@ -365,7 +425,6 @@ def parse_developer_data(self, dataset: Dict, dev_id: str) -> List[Dict]:
         else:
             apps_path = [0, 1, 0, 22, 0]
 
-        from ..models.element_specs import nested_lookup
         apps_data = nested_lookup(data.get("data", data), apps_path)
         if not apps_data:
             return []
@@ -438,7 +497,6 @@ def parse_similar_data(self, dataset: Dict) -> List[Dict]:
         except Exception:
             return []
 
-        from ..models.element_specs import nested_lookup
         apps_data = nested_lookup(data.get("data", data), [0, 1, 0, 21, 0])
         if not apps_data:
             return []
@@ -501,7 +559,6 @@ def parse_list_data(self, dataset: Dict, count: int) -> List[Dict]:
         if not collection_data:
             return []
 
-        from ..models.element_specs import nested_lookup
         apps_data = nested_lookup(collection_data, [0, 1, 0, 28, 0])
         if not apps_data:
             return []
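To make `parse_html_content` concrete: Play Store pages embed their data in `AF_initDataCallback` script blocks, and the three regexes pull out each dataset key and its JSON payload. A self-contained sketch on a fabricated snippet (real pages contain many such blocks):

```python
import json
import re

# Fabricated stand-in for a Play Store search page script block.
html = ("<script>AF_initDataCallback({key: 'ds:1', hash: '1', "
        'data:[["example"]], sideChannel: {}});</script>')

script_regex = re.compile(r"AF_initDataCallback[\s\S]*?</script")
key_regex = re.compile(r"(ds:.*?)'")
value_regex = re.compile(r"data:([\s\S]*?), sideChannel: \{\}\}\);</")

dataset = {}
for block in script_regex.findall(html):
    keys = key_regex.findall(block)      # e.g. ['ds:1']
    values = value_regex.findall(block)  # the raw JSON payload
    if keys and values:
        try:
            dataset[keys[0]] = json.loads(values[0])
        except json.JSONDecodeError:
            continue

print(dataset)  # {'ds:1': [['example']]}
```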

gplay_scraper/core/gplay_scraper.py

Lines changed: 69 additions & 24 deletions
@@ -96,43 +96,88 @@ def fetch_playstore_search(self, query: str, count: int, lang: str = Config.DEFA
         if count <= 0:
             return ""
 
-        return self.http_client.fetch_search_page(query, lang, country)
+        return self.http_client.fetch_search_page(query=query, lang=lang, country=country)
 
-    def scrape_play_store_data(self, html_content: str) -> Dict:
-        """Extract datasets from search page HTML.
+    def scrape_play_store_data(self, query: str, count: int = Config.DEFAULT_SEARCH_COUNT, lang: str = Config.DEFAULT_LANGUAGE, country: str = Config.DEFAULT_COUNTRY) -> Dict:
+        """Scrape search results with automatic pagination support.
 
         Args:
-            html_content: HTML content of search page
+            query: Search query string
+            count: Total number of results to fetch
+            lang: Language code
+            country: Country code
 
         Returns:
-            Dictionary containing all datasets
+            Dictionary containing all search results
 
         Raises:
-            DataParsingError: If no datasets found
+            DataParsingError: If parsing fails
         """
-        script_regex = re.compile(r"AF_initDataCallback[\s\S]*?</script")
-        key_regex = re.compile(r"(ds:.*?)'")
-        value_regex = re.compile(r"data:([\s\S]*?), sideChannel: {}}\);</")
+        # Fetch the initial search page
+        html_content = self.fetch_playstore_search(query, count, lang, country)
 
-        matches = script_regex.findall(html_content)
-        dataset = {}
+        # Delegate all HTML parsing to the search parser
+        from .gplay_parser import SearchParser
+        parser = SearchParser()
+        dataset = parser.parse_html_content(html_content)
 
-        for match in matches:
-            key_match = key_regex.findall(match)
-            value_match = value_regex.findall(match)
-
-            if key_match and value_match:
-                key = key_match[0]
-                try:
-                    value = json.loads(value_match[0])
-                    dataset[key] = value
-                except json.JSONDecodeError:
-                    continue
+        # The first page carries at most 20 results, so stop here if that is enough
+        if count <= 20:
+            return dataset
+
+        # Extract the pagination token
+        token = parser.extract_pagination_token(dataset)
+
+        # Collect all results, starting with the first page
+        all_results = []
+        initial_results = self._get_nested_value(dataset.get("ds:1", []), [0, 1, 0, 0, 0], [])
+        all_results.extend(initial_results)
+
+        # Fetch further batches while more results are needed and a token exists
+        while len(all_results) < count and token:
+            needed = min(100, count - len(all_results))
+            try:
+                response_text = self.http_client.fetch_search_page(token=token, needed=needed, lang=lang, country=country)
+                data = json.loads(response_text[5:])
+                parsed_data = json.loads(data[0][2])
+
+                if parsed_data:
+                    paginated_results = self._get_nested_value(parsed_data, [0, 0, 0], [])
+                    all_results.extend(paginated_results)
+                    # Update token for next iteration
+                    token = self._get_nested_value(parsed_data, [0, 0, 7, 1])
+                else:
+                    break
+            except Exception:  # json.JSONDecodeError, IndexError, KeyError, ...
+                break
 
-        if not dataset:
-            raise DataParsingError(Config.ERROR_MESSAGES["NO_DS5_DATA"])
+        # Write the combined results back into the dataset
+        if "ds:1" in dataset:
+            dataset["ds:1"][0][1][0][0][0] = all_results[:count]
 
         return dataset
+
+    def _get_nested_value(self, data, path, default=None):
+        """Safely get a nested value from a data structure.
+
+        Args:
+            data: Data structure to traverse
+            path: List of keys/indices to follow
+            default: Default value if path not found
+
+        Returns:
+            Value at path or default
+        """
+        try:
+            for key in path:
+                data = data[key]
+            return data
+        except (KeyError, IndexError, TypeError):
+            return default
 
 
 class ReviewsScraper:
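Both the pagination loop and the token refresh depend on `_get_nested_value` never raising on Google's deeply nested arrays. A standalone sketch of the same traversal on fabricated data:

```python
# Standalone copy of the safe-traversal helper; the payload below is
# fabricated, not a real Play Store response.
def get_nested_value(data, path, default=None):
    """Follow a list of keys/indices, returning `default` on any miss."""
    try:
        for key in path:
            data = data[key]
        return data
    except (KeyError, IndexError, TypeError):
        return default

payload = [[None, [[["app-one", "app-two"]]]]]

print(get_nested_value(payload, [0, 1, 0, 0]))   # ['app-one', 'app-two']
print(get_nested_value(payload, [0, 9, 0], []))  # [] (IndexError swallowed)
print(get_nested_value(payload, [0, 1, "x"]))    # None (TypeError swallowed)
```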

gplay_scraper/utils/helpers.py

Lines changed: 1 addition & 1 deletion
@@ -230,4 +230,4 @@ def calculate_monthly_installs(install_count, release_date_str: Optional[str], c
         return 0
 
     months_since_release = days_since_release / 30.44
-    return int(install_count / months_since_release)
+    return int(install_count / months_since_release)
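For context on the unchanged arithmetic: 30.44 is the average number of days in a month (365.25 / 12), so the function converts elapsed days to months before dividing. A small worked sketch; only the two visible lines of the function are reproduced, the rest of its body is not in this diff:

```python
# Worked example of the visible tail of calculate_monthly_installs.
# 30.44 ~= 365.25 / 12, the average length of a month in days.
install_count = 1_000_000
days_since_release = 365

months_since_release = days_since_release / 30.44  # ~11.99 months
print(int(install_count / months_since_release))   # 83397 installs/month
```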
