Skip to content

Commit 80afe1c

Browse files
committed
Fix rating_votes, trailer and cast item matches
1 parent a8afa8e commit 80afe1c

File tree

1 file changed

+118
-44
lines changed

1 file changed

+118
-44
lines changed

src/HtmlPieces.php

Lines changed: 118 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -27,27 +27,28 @@ public function get(object $page, string $element)
2727

2828
switch ($element) {
2929
case "title":
30-
$patterns = [".title_wrapper h1", "h1[data-testid=hero-title-block__title]"];
30+
$patterns = ["h1[data-testid=hero-title-block__title]", ".title_wrapper h1"];
3131
$title = $this->findMatchInPatterns($dom, $page, $patterns);
3232

3333
return $this->strClean($title);
3434
break;
3535

3636
case "year":
37-
$patterns = [".title_wrapper h1 #titleYear a", "section section div div div ul li a"];
37+
$patterns = ["section section div div div ul li a", ".title_wrapper h1 #titleYear a"];
3838
$year = $this->findMatchInPatterns($dom, $page, $patterns);
3939

4040
return $this->strClean($year);
4141
break;
4242

4343
case "length":
44-
$patterns = [".subtext time", "section section div div div ul li"];
44+
$patterns = ["section section div div div ul li", ".subtext time"];
4545
$length = "";
4646

47-
$length = $dom->find($page, $patterns[0])->text;
47+
$length = $dom->find($page, $patterns[1])->text;
4848
if ($this->count($length) > 0) return $this->strClean($length);
4949

50-
$iter = $dom->find($page, $patterns[1]);
50+
$length = "";
51+
$iter = $dom->find($page, $patterns[0]);
5152
if ($this->count($iter) === 0) return $length;
5253

5354
// Loop row below main title
@@ -69,28 +70,29 @@ public function get(object $page, string $element)
6970
break;
7071

7172
case "plot":
72-
$patterns = [".plot_summary .summary_text", "p[data-testid=plot] div"];
73+
$patterns = ["p[data-testid=plot] div", ".plot_summary .summary_text"];
7374
$plot = $this->findMatchInPatterns($dom, $page, $patterns);
7475

7576
return $this->strClean($plot);
7677
break;
7778

7879
case "rating":
79-
$patterns = [".ratings_wrapper .ratingValue span[itemprop=ratingValue]", "div[data-testid=hero-title-block__aggregate-rating__score]"];
80+
$patterns = ["main div[data-testid=hero-title-block__aggregate-rating__score]", ".ratings_wrapper .ratingValue span[itemprop=ratingValue]"];
8081
$rating = $this->findMatchInPatterns($dom, $page, $patterns);
8182

8283
return $this->strClean($rating);
8384
break;
8485

8586
case "rating_votes":
86-
$patterns = [".ratings_wrapper span[itemprop=ratingCount]", "div[class*=TotalRatingAmount]"];
87+
$patterns = ["main div[class*=TotalRatingAmount]", ".ratings_wrapper span[itemprop=ratingCount]"];
8788
$rating_votes = $this->findMatchInPatterns($dom, $page, $patterns);
89+
$rating_votes = $this->unwrapFormattedNumber($rating_votes);
8890

89-
return preg_replace("/[^0-9 ]/", "", $this->strClean($rating_votes));
91+
return preg_replace("/[^0-9]/", "", $this->strClean($rating_votes));
9092
break;
9193

9294
case "poster":
93-
$patterns = [".poster img", ".ipc-poster img"];
95+
$patterns = [".ipc-poster .ipc-media img", ".poster img"];
9496
$poster = $this->findMatchInPatterns($dom, $page, $patterns, "src");
9597
$poster = preg_match('/@/', $poster) ? preg_split('~@(?=[^@]*$)~', $poster)[0] . "@.jpg" : $poster;
9698

@@ -100,14 +102,19 @@ public function get(object $page, string $element)
100102
case "trailer":
101103
// section section div section section div div div div div a[aria-label^=Watch]
102104
// div a[class*=hero-media][aria-label^=Watch]
103-
$patterns = [".slate a[data-video]", "div a[aria-label^=Watch]"];
104-
$trailerLink = $dom->find($page, $patterns[1]);
105+
$patterns = ["div a[aria-label^=Watch]", ".slate a[data-video]"];
106+
$trailerLinkOld = $dom->find($page, $patterns[1]);
107+
$trailerLink = $dom->find($page, $patterns[0]);
105108

106109
if ($this->count($trailerLink)) {
107110
$href = $trailerLink->getAttribute("href");
108111
preg_match("/\/video\/(vi[a-zA-Z0-9]+)/", $href, $matches);
109112
$trailerId = $this->count($matches) > 1 ? $matches[1] : "";
110113
$trailerLink = $this->count($trailerId) ? "https://www.imdb.com/video/".$trailerId : "";
114+
115+
} elseif ($this->count($trailerLinkOld)) {
116+
$trailerId = $this->count($trailerLinkOld) ? $trailerLinkOld->getAttribute("data-video") : "";
117+
$trailerLink = $this->count($trailerId) ? "https://www.imdb.com/video/".$trailerId : "";
111118
} else {
112119
$trailerId = "";
113120
$trailerLink = "";
@@ -121,45 +128,82 @@ public function get(object $page, string $element)
121128

122129
case "cast":
123130
$cast = [];
131+
$findAllCastOld = $dom->find($page, 'table.cast_list tr');
124132
$findAllCast = $dom->find($page, 'section.title-cast div.title-cast__grid div');
125-
foreach ($findAllCast as $castRow)
126-
{
127-
if ($this->count($castRow->find('img')) === 0) {
128-
continue;
129-
}
130133

131-
$actor = [];
132-
$actor["actor"] = "";
133-
$actor["actor_id"] = "";
134-
$actor["character"] = "";
134+
// Use $findAllCastOld
135+
if ($this->count($findAllCastOld)) {
136+
foreach ($findAllCastOld as $castRow)
137+
{
138+
if ($this->count($castRow->find('.primary_photo')) === 0) {
139+
continue;
140+
}
141+
$actor = [];
142+
143+
$characterLink = $castRow->find('.character a');
144+
$actor["character"] = count($characterLink) ? $characterLink->text : $dom->find($castRow, '.character')->text;
145+
146+
$actorRow = $castRow->find('td')[1];
147+
$actorLink = $actorRow->find('a');
148+
if ($this->count($actorLink) > 0) {
149+
// Set actor name to text within link
150+
$actor["actor"] = $actorLink->text;
151+
$actor["actor_id"] = $this->extractImdbId($actorLink->href);
152+
} else {
153+
// No link found
154+
// Set actor name to whatever is there
155+
$actor["actor"] = $actorRow->text;
156+
}
135157

136-
// Actor
137-
$actorLink = $castRow->find('a[data-testid=title-cast-item__actor]');
138-
if ($this->count($actorLink)) {
139-
$actor["actor"] = $actorLink->text;
140-
}
158+
$actor["character"] = $this->strClean($actor["character"]);
159+
$actor["actor"] = $this->strClean($actor["actor"]);
160+
$actor["actor_id"] = $this->strClean($actor["actor_id"]);
141161

142-
// Actor ID
143-
$link = $castRow->find('a');
144-
if ($this->count($link)) {
145-
$href = $link->getAttribute("href");
146-
preg_match("/(nm[0-9]+)/", $href, $matches);
147-
if ($this->count($matches)) {
148-
$actor["actor_id"] = $matches[0];
149-
}
162+
array_push($cast, $actor);
150163
}
164+
}
151165

152-
// Character
153-
$characterLink = $castRow->find('a[data-testid=cast-item-characters-link]');
154-
if ($this->count($characterLink)) {
155-
$actor["character"] = $characterLink->text;
166+
// Use 'new' $findAllCast
167+
if ($this->count($findAllCast)) {
168+
foreach ($findAllCast as $castRow)
169+
{
170+
if ($this->count($castRow->find('img')) === 0) {
171+
continue;
172+
}
173+
174+
$actor = [];
175+
$actor["actor"] = "";
176+
$actor["actor_id"] = "";
177+
$actor["character"] = "";
178+
179+
// Actor
180+
$actorLink = $castRow->find('a[data-testid=title-cast-item__actor]');
181+
if ($this->count($actorLink)) {
182+
$actor["actor"] = $actorLink->text;
183+
}
184+
185+
// Actor ID
186+
$link = $castRow->find('a');
187+
if ($this->count($link)) {
188+
$href = $link->getAttribute("href");
189+
preg_match("/(nm[0-9]+)/", $href, $matches);
190+
if ($this->count($matches)) {
191+
$actor["actor_id"] = $matches[0];
192+
}
193+
}
194+
195+
// Character
196+
$characterLink = $castRow->find('a[data-testid=cast-item-characters-link]');
197+
if ($this->count($characterLink)) {
198+
$actor["character"] = $characterLink->text;
199+
}
200+
201+
$actor["character"] = $this->strClean($actor["character"]);
202+
$actor["actor"] = $this->strClean($actor["actor"]);
203+
$actor["actor_id"] = $this->strClean($actor["actor_id"]);
204+
205+
array_push($cast, $actor);
156206
}
157-
158-
$actor["character"] = $this->strClean($actor["character"]);
159-
$actor["actor"] = $this->strClean($actor["actor"]);
160-
$actor["actor_id"] = $this->strClean($actor["actor_id"]);
161-
162-
array_push($cast, $actor);
163207
}
164208
return $cast;
165209
break;
@@ -255,6 +299,36 @@ public function findMatchInPatterns(object $dom, object $page, array $patterns,
255299
return $str;
256300
}
257301

/**
 * Expand an abbreviated count such as "1.5K" into plain digits ("1500").
 *
 * The last non-whitespace character is treated as a magnitude suffix:
 * K (thousand), M (million) or B (billion), matched case-insensitively.
 * Input that does not end in one of those suffixes is returned unchanged.
 *
 * @param string $str Formatted number, e.g. "1.5K", "2M" or "12,345"
 * @return string Whole-number digit string when a suffix was expanded,
 *                otherwise the original input
 */
public function unwrapFormattedNumber($str)
{
    $multipliers = [
        "K" => 1000,
        "M" => 1000000,
        "B" => 1000000000,
    ];

    // Scraped text often carries surrounding whitespace; without trimming,
    // "1.5K " would not be recognised and would later collapse to "15".
    $trimmed = trim((string) $str);
    if ($trimmed === "") {
        return $str;
    }

    $suffix = strtoupper(substr($trimmed, -1));
    if (!isset($multipliers[$suffix])) {
        return $str;
    }

    // Everything before the suffix is the numeric part.
    $num = floatval(substr($trimmed, 0, -1));

    // Format as a whole number with no separators so float artefacts
    // (1.1 * 1000 = 1100.0000000000002) or exponent notation never leak
    // into the digit string.
    return number_format($num * $multipliers[$suffix], 0, "", "");
}
331+
258332
/**
259333
* Extract an imdb-id from a string '/ttxxxxxxx/'
260334
* Returns string of id or empty string if none found

0 commit comments

Comments
 (0)