11<?php
22/** Crawl information of a specific application in the Google Play Store
33 * @class GooglePlay
4- * @version 1.0.1
4+ * @version 1.1.0
55 * @author Max Base & Izzy
66 * @copyright MIT https://github.com/BaseMax/GooglePlayWebServiceAPI/blob/master/LICENSE
77 * @log 2020-10-19 first release
@@ -141,20 +141,6 @@ public function parseApplication($packageName, $lang='en_US', $loc='US') {
141141 }
142142
143143 $ values ["developer " ] = strip_tags ($ this ->getRegVal ('/href="\/store\/apps\/dev(eloper)*\?id=(?<id>[^\"]+)"([^\>]*|)>(\<span[^\>]*>)*(?<content>[^\<]+)(<\/span>|)<\/a>/i ' ));
144-
145- preg_match ('/<a class="WpHeLc VfPpkd-mRLv6 VfPpkd-RLmnJb" href="\/store\/apps\/category\/(?<id>[^\"]+)" aria-label="(?<content>[^\"]+)"/i ' , $ this ->input , $ category );
146- if ( empty ($ category ) ) preg_match ('/href="\/store\/apps\/category\/(?<id>[^\"]+)" data-disable-idom="true" data-skip-focus-on-activate="false" jsshadow><span class="VfPpkd-N5Lhkf" jsname="bN97Pc"><span class="VfPpkd-jY41G-V67aGc" jsname="V67aGc">(?<content>[^\<]+)<\/span>/i ' , $ this ->input , $ category );
147- if (isset ($ category ["id " ], $ category ["content " ])) {
148- $ values ["category " ] = trim (strip_tags ($ category ["content " ]));
149- $ catId = trim (strip_tags ($ category ["id " ]));
150- if ($ catId =='GAME ' || substr ($ catId ,0 ,5 )=='GAME_ ' ) $ values ["type " ] = "game " ;
151- elseif ($ catId =='FAMILY ' || substr ($ catId ,0 ,7 )=='FAMILY? ' ) $ values ["type " ] = "family " ;
152- else $ values ["type " ] = "app " ;
153- } else {
154- $ values ["category " ] = null ;
155- $ values ["type " ] = null ;
156- }
157-
158144 $ values ["summary " ] = strip_tags ($ this ->getRegVal ('/property="og:description" content="(?<content>[^\"]+)/i ' ));
159145 $ values ["description " ] = $ this ->getRegVal ('/itemprop="description"[^\>]*><div class="bARER"[^\>]*>(?<content>.*?)<\/div><div class=/i ' );
160146 if ( strtolower (substr ($ lang ,0 ,2 )) != 'en ' ) { // Google sometimes keeps the EN description additionally, so we need to filter it out **TODO:** check if this still applies (2022-05-27)
@@ -204,6 +190,28 @@ public function parseApplication($packageName, $lang='en_US', $loc='US') {
204190 $ values ["votes " ] = $ this ->getRegVal ('/<div class="g1rdde">(?<content>[^>]+) reviews<\/div>/i ' );
205191 $ values ["price " ] = $ this ->getRegVal ('/<meta itemprop="price" content="(?<content>[^"]+)">/i ' );
206192
193+ // ld+json data, see https://github.com/BaseMax/GooglePlayWebServiceAPI/issues/22#issuecomment-1168397748
194+ $ d = new DomDocument ();
195+ @$ d ->loadHTML ($ this ->input );
196+ $ xp = new domxpath ($ d );
197+ $ jsonScripts = $ xp ->query ( '//script[@type="application/ld+json"] ' );
198+ $ json = trim ( @$ jsonScripts ->item (0 )->nodeValue ); //
199+ $ data = json_decode ($ json ,true );
200+ if (isset ($ data ['applicationCategory ' ])) {
201+ $ values ["category " ] = $ data ['applicationCategory ' ];
202+ if ( substr ($ values ["category " ],0 ,5 )=='GAME_ ' ) $ values ["type " ] = "game " ;
203+ elseif ( substr ($ values ["category " ],0 ,7 )=='FAMILY? ' ) $ values ["type " ] = "family " ;
204+ else $ values ["type " ] = "app " ;
205+ $ cats = $ this ->parseCategories ();
206+ if ( $ cats ["success " ] && !empty ($ cats ["data " ][$ values ["category " ]]) ) $ values ["category " ] = $ cats ["data " ][$ values ["category " ]]->name ;
207+ } else {
208+ $ values ["category " ] = null ;
209+ $ values ["type " ] = null ;
210+ }
211+ if ( empty ($ values ["summary " ]) && !empty ($ data ["description " ]) ) $ values ["summary " ] = $ data ["description " ];
212+ if (isset ($ data ["contentRating " ])) $ values ["contentRating " ] = $ data ["contentRating " ];
213+ else $ values ["contentRating " ] = "" ;
214+
207215 $ limit = 5 ; $ proto = '' ;
208216 while ( empty ($ proto ) && $ limit > 0 ) { // sometimes protobuf is missing, but present again on subsequent call
209217 $ proto = json_decode ($ this ->getRegVal ("/key: 'ds:4'. hash: '7'. data:(?<content>\[\[\[.+?). sideChannel: .*?\);<\/script/ims " )); // ds:8 hash:22 would have reviews
@@ -214,6 +222,7 @@ public function parseApplication($packageName, $lang='en_US', $loc='US') {
214222 if ( empty ($ values ["featureGraphic " ]) ) $ values ["featureGraphic " ] = $ proto [1 ][2 ][96 ][0 ][3 ][2 ];
215223 if ( empty ($ values ["video " ]) && !empty ($ proto [1 ][2 ][100 ]) ) $ values ["video " ] = $ proto [1 ][2 ][100 ][0 ][0 ][3 ][2 ];
216224 if ( empty ($ values ["summary " ]) && !empty ($ proto [1 ][2 ][73 ]) ) $ values ["summary " ] = $ proto [1 ][2 ][73 ][0 ][1 ]; // 1, 2, 73, 0, 1
225+ // category: $proto[1][2][79][0][0][0]; catId: $proto[1][2][79][0][0][2]
217226 // screenshots: 1,2,78,0,0-n; 1=format,2=[wid,hei],3.2=url
218227 // more details see: https://github.com/JoMingyu/google-play-scraper/blob/2caddd098b63736318a7725ff105907f397b9a48/google_play_scraper/constants/element.py
219228 break ;
0 commit comments