|
8 | 8 | import re |
9 | 9 | from datetime import datetime, timezone |
10 | 10 | from typing import Dict, Any, List, Optional, Tuple |
11 | | -from ..models.element_specs import ElementSpecs |
| 11 | +from ..models.element_specs import ElementSpecs, nested_lookup |
12 | 12 | from ..utils.helpers import clean_json_string, alternative_json_clean, calculate_app_age, calculate_daily_installs, calculate_monthly_installs |
13 | 13 | from ..config import Config |
14 | 14 | from ..exceptions import DataParsingError |
@@ -155,7 +155,6 @@ def parse_search_results(self, dataset: Dict, count: int) -> List[Dict]: |
155 | 155 | if "ds:1" not in dataset: |
156 | 156 | return [] |
157 | 157 |
|
158 | | - from ..models.element_specs import nested_lookup |
159 | 158 | search_data = nested_lookup(dataset.get("ds:1", {}), [0, 1, 0, 0, 0]) |
160 | 159 |
|
161 | 160 | if not search_data: |
@@ -209,6 +208,67 @@ def format_search_result(self, result: dict) -> dict: |
209 | 208 | "free": result.get("free"), |
210 | 209 | "url": result.get("url"), |
211 | 210 | } |
| 211 | + |
def extract_pagination_token(self, dataset: Dict) -> Optional[str]:
    """Extract the pagination token from a search dataset.

    Reads the value found at path ``[0, 1, 0, 0]`` under the ``"ds:1"``
    entry of ``dataset`` and scans it for the first list-valued section
    (with more than one element) whose index-1 lookup is a string.

    Args:
        dataset: Search dataset; only its ``"ds:1"`` entry is consulted.

    Returns:
        The pagination token, or None when ``"ds:1"`` has no usable
        sections or no section carries a string at index 1.
    """
    sections = nested_lookup(dataset.get("ds:1", {}), [0, 1, 0, 0])

    # Anything other than a non-empty list cannot hold sections; bail out
    # instead of failing while iterating an unexpected lookup result.
    if not sections or not isinstance(sections, list):
        return None

    for section in sections:
        if not isinstance(section, list) or len(section) <= 1:
            continue
        potential_token = nested_lookup(section, [1])
        if isinstance(potential_token, str):
            return potential_token
    return None
| 232 | + |
def parse_html_content(self, html_content: str) -> Dict:
    """Extract datasets from search page HTML.

    Scans the HTML for ``AF_initDataCallback`` script segments. For each
    segment that exposes both a ``ds:...`` key and a ``data:`` payload
    terminated by ``, sideChannel: {}});</``, the payload is decoded as
    JSON and stored under that key. Segments whose payload is not valid
    JSON are skipped.

    Args:
        html_content: HTML content of search page.

    Returns:
        Dictionary mapping each ``ds:...`` key to its decoded payload.

    Raises:
        DataParsingError: If no dataset could be extracted.
    """
    # Imported once per call (not inside the loop) so the method does not
    # depend on a module-level json import.
    import json

    script_regex = re.compile(r"AF_initDataCallback[\s\S]*?</script")
    key_regex = re.compile(r"(ds:.*?)'")
    value_regex = re.compile(r"data:([\s\S]*?), sideChannel: \{\}\}\);</")

    dataset = {}
    for script in script_regex.findall(html_content):
        # Only the first key/payload occurrence in a segment is used.
        key_match = key_regex.search(script)
        value_match = value_regex.search(script)
        if key_match is None or value_match is None:
            continue
        try:
            dataset[key_match.group(1)] = json.loads(value_match.group(1))
        except json.JSONDecodeError:
            # Best effort: a malformed payload must not abort the others.
            continue

    if not dataset:
        raise DataParsingError("No search data found in HTML")

    return dataset
212 | 272 |
|
213 | 273 |
|
214 | 274 | class ReviewsParser: |
@@ -365,7 +425,6 @@ def parse_developer_data(self, dataset: Dict, dev_id: str) -> List[Dict]: |
365 | 425 | else: |
366 | 426 | apps_path = [0, 1, 0, 22, 0] |
367 | 427 |
|
368 | | - from ..models.element_specs import nested_lookup |
369 | 428 | apps_data = nested_lookup(data.get("data", data), apps_path) |
370 | 429 | if not apps_data: |
371 | 430 | return [] |
@@ -438,7 +497,6 @@ def parse_similar_data(self, dataset: Dict) -> List[Dict]: |
438 | 497 | except Exception: |
439 | 498 | return [] |
440 | 499 |
|
441 | | - from ..models.element_specs import nested_lookup |
442 | 500 | apps_data = nested_lookup(data.get("data", data), [0, 1, 0, 21, 0]) |
443 | 501 | if not apps_data: |
444 | 502 | return [] |
@@ -501,7 +559,6 @@ def parse_list_data(self, dataset: Dict, count: int) -> List[Dict]: |
501 | 559 | if not collection_data: |
502 | 560 | return [] |
503 | 561 |
|
504 | | - from ..models.element_specs import nested_lookup |
505 | 562 | apps_data = nested_lookup(collection_data, [0, 1, 0, 28, 0]) |
506 | 563 | if not apps_data: |
507 | 564 | return [] |
|
0 commit comments