Skip to content

Commit a447308

Browse files
committed
fix category parsing (thanks @andaroid!) and add contentRating
1 parent 0d6dafb commit a447308

File tree

1 file changed

+24
-15
lines changed

1 file changed

+24
-15
lines changed

google-play.php

Lines changed: 24 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
<?php
22
/** Crawl information of a specific application in the Google Play Store
33
* @class GooglePlay
4-
* @version 1.0.1
4+
* @version 1.1.0
55
* @author Max Base & Izzy
66
* @copyright MIT https://github.com/BaseMax/GooglePlayWebServiceAPI/blob/master/LICENSE
77
* @log 2020-10-19 first release
@@ -141,20 +141,6 @@ public function parseApplication($packageName, $lang='en_US', $loc='US') {
141141
}
142142

143143
$values["developer"] = strip_tags($this->getRegVal('/href="\/store\/apps\/dev(eloper)*\?id=(?<id>[^\"]+)"([^\>]*|)>(\<span[^\>]*>)*(?<content>[^\<]+)(<\/span>|)<\/a>/i'));
144-
145-
preg_match('/<a class="WpHeLc VfPpkd-mRLv6 VfPpkd-RLmnJb" href="\/store\/apps\/category\/(?<id>[^\"]+)" aria-label="(?<content>[^\"]+)"/i', $this->input, $category);
146-
if ( empty($category) ) preg_match('/href="\/store\/apps\/category\/(?<id>[^\"]+)" data-disable-idom="true" data-skip-focus-on-activate="false" jsshadow><span class="VfPpkd-N5Lhkf" jsname="bN97Pc"><span class="VfPpkd-jY41G-V67aGc" jsname="V67aGc">(?<content>[^\<]+)<\/span>/i', $this->input, $category);
147-
if (isset($category["id"], $category["content"])) {
148-
$values["category"] = trim(strip_tags($category["content"]));
149-
$catId = trim(strip_tags($category["id"]));
150-
if ($catId=='GAME' || substr($catId,0,5)=='GAME_') $values["type"] = "game";
151-
elseif ($catId=='FAMILY' || substr($catId,0,7)=='FAMILY?') $values["type"] = "family";
152-
else $values["type"] = "app";
153-
} else {
154-
$values["category"] = null;
155-
$values["type"] = null;
156-
}
157-
158144
$values["summary"] = strip_tags($this->getRegVal('/property="og:description" content="(?<content>[^\"]+)/i'));
159145
$values["description"] = $this->getRegVal('/itemprop="description"[^\>]*><div class="bARER"[^\>]*>(?<content>.*?)<\/div><div class=/i');
160146
if ( strtolower(substr($lang,0,2)) != 'en' ) { // Google sometimes keeps the EN description additionally, so we need to filter it out **TODO:** check if this still applies (2022-05-27)
@@ -204,6 +190,28 @@ public function parseApplication($packageName, $lang='en_US', $loc='US') {
204190
$values["votes"] = $this->getRegVal('/<div class="g1rdde">(?<content>[^>]+) reviews<\/div>/i');
205191
$values["price"] = $this->getRegVal('/<meta itemprop="price" content="(?<content>[^"]+)">/i');
206192

193+
// ld+json data, see https://github.com/BaseMax/GooglePlayWebServiceAPI/issues/22#issuecomment-1168397748
194+
$d = new DomDocument();
195+
@$d->loadHTML($this->input);
196+
$xp = new domxpath($d);
197+
$jsonScripts = $xp->query( '//script[@type="application/ld+json"]' );
198+
$json = trim( @$jsonScripts->item(0)->nodeValue ); //
199+
$data = json_decode($json,true);
200+
if (isset($data['applicationCategory'])) {
201+
$values["category"] = $data['applicationCategory'];
202+
if ( substr($values["category"],0,5)=='GAME_' ) $values["type"] = "game";
203+
elseif ( substr($values["category"],0,7)=='FAMILY?' ) $values["type"] = "family";
204+
else $values["type"] = "app";
205+
$cats = $this->parseCategories();
206+
if ( $cats["success"] && !empty($cats["data"][$values["category"]]) ) $values["category"] = $cats["data"][$values["category"]]->name;
207+
} else {
208+
$values["category"] = null;
209+
$values["type"] = null;
210+
}
211+
if ( empty($values["summary"]) && !empty($data["description"]) ) $values["summary"] = $data["description"];
212+
if (isset($data["contentRating"])) $values["contentRating"] = $data["contentRating"];
213+
else $values["contentRating"] = "";
214+
207215
$limit = 5; $proto = '';
208216
while ( empty($proto) && $limit > 0 ) { // sometimes protobuf is missing, but present again on subsequent call
209217
$proto = json_decode($this->getRegVal("/key: 'ds:4'. hash: '7'. data:(?<content>\[\[\[.+?). sideChannel: .*?\);<\/script/ims")); // ds:8 hash:22 would have reviews
@@ -214,6 +222,7 @@ public function parseApplication($packageName, $lang='en_US', $loc='US') {
214222
if ( empty($values["featureGraphic"]) ) $values["featureGraphic"] = $proto[1][2][96][0][3][2];
215223
if ( empty($values["video"]) && !empty($proto[1][2][100]) ) $values["video"] = $proto[1][2][100][0][0][3][2];
216224
if ( empty($values["summary"]) && !empty($proto[1][2][73]) ) $values["summary"] = $proto[1][2][73][0][1]; // 1, 2, 73, 0, 1
225+
// category: $proto[1][2][79][0][0][0]; catId: $proto[1][2][79][0][0][2]
217226
// screenshots: 1,2,78,0,0-n; 1=format,2=[wid,hei],3.2=url
218227
// more details see: https://github.com/JoMingyu/google-play-scraper/blob/2caddd098b63736318a7725ff105907f397b9a48/google_play_scraper/constants/element.py
219228
break;

0 commit comments

Comments
 (0)