-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape-creator.php
More file actions
118 lines (94 loc) · 3.55 KB
/
scrape-creator.php
File metadata and controls
118 lines (94 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
<?php
use asset\ScrapedAsset;
use asset\StoredAssetQuery;
use creator\Creator;
use creator\CreatorLogic;
use log\LogLevel;
use database\Database;
use log\Log;
use log\LogResult;
use thumbnail\Thumbnail;
use misc\StringUtil;
require_once __DIR__ . '/../include/init.php';
// Show CLI syntax when help is requested or too many arguments were passed.
// $argc counts the script name itself, so the two optional arguments
// ([creator-slug|creator-id] and [force]) make 3 the largest valid $argc.
// The previous "$argc > 2" check rejected every invocation that supplied
// the documented force argument.
if ($argc > 3 || in_array($argv[1] ?? '', ['-h', '--help', 'help', 'h'], true)) {
    echo "Usage: php scrape-creator.php [creator-slug|creator-id] [force]
Not setting a creator will pick a random one from the regular scraping targets.
The force argument disables graceful backoff on errors.\n";
    exit(1);
}
// Capture one timestamp for the whole run. It names the log file here and is
// passed to the creator's failed-attempt bookkeeping further down.
$now = new DateTime();
$timestamp = $now->format('Y-m-d\TH-i-s-v');
Log::start(
    logName: 'scrape-creator/' . $timestamp,
    writeToStdout: true,
);

// Force mode is on only when the second CLI argument is "force" (case-insensitive).
// A missing argument falls back to '' and therefore never matches.
$force = strtolower($argv[2] ?? '') === 'force';
Log::write("Forceful mode?", $force, LogLevel::DEBUG);
// Choose the creator to scrape: an explicit CLI argument takes precedence,
// otherwise a random scraping target is requested. randomScrapingTarget()
// receives true unless force mode is active.
/** @var Creator $creator */
$creator = isset($argv[1])
    ? Creator::fromValueOrSlug($argv[1])
    : Creator::randomScrapingTarget(!$force);

// Fetch the creator-specific scraping implementation.
/** @var CreatorLogic $creatorLogic */
$creatorLogic = $creator->getLogic();
Log::write("Loaded logic and starting to scrape for creator", $creator->name, LogLevel::INFO);
// Count this run as a failed attempt up front; the counter is reset at the
// end of the script once everything has completed.
$creator->incrementFailedAttempts($now);

// Query the assets already stored for this creator so the scraper has
// something to compare against.
$query = new StoredAssetQuery();
$query->filterCreator = [$creator];
// NOTE(review): null presumably disables the status filter and the row
// limit — confirm against StoredAssetQuery.
$query->filterStatus = null;
$query->limit = null;
$existingAssets = $query->execute();

// NOTE(review): other Log::write() calls in this script pass the level as
// the third argument; here it is the second — confirm this is intended.
Log::write("Found " . count($existingAssets) . " existing assets for creator.", LogLevel::INFO);
// Run the creator-specific scraper, handing it the assets that are already
// stored (NOTE(review): presumably so it can skip known items — confirm).
$newScrapedAssets = $creatorLogic->scrapeAssets($existingAssets);

// Post-process every non-null result: extend and clean up its tag list.
foreach ($newScrapedAssets as $candidate) {
    if ($candidate !== null) {
        // Each whitespace-separated word of the title becomes an extra tag.
        // preg_split() returns false on failure, in which case no words are added.
        $words = preg_split('/\s+/', $candidate->title);
        if (is_array($words)) {
            $candidate->tags = array_merge($candidate->tags, $words);
        }

        // The creator's slug is always added as a tag before filtering.
        $candidate->tags[] = $creator->slug();
        $candidate->tags = StringUtil::filterTagArray($candidate->tags);
    }
}

Log::write("Found " . count($newScrapedAssets) . " new assets", $newScrapedAssets, LogLevel::INFO);
// Save new assets to DB
if (count($newScrapedAssets) > 0) {
    // Number of assets actually committed; skipped entries are not counted,
    // so the summary log line below reflects what was really written.
    $writtenCount = 0;

    /**
     * @var ScrapedAsset|null $newScrapedAsset
     */
    foreach ($newScrapedAssets as $newScrapedAsset) {
        // The scraper result may contain null entries (the post-processing
        // loop skips them too); ignore them instead of dereferencing null.
        if ($newScrapedAsset === null) {
            continue;
        }

        // Validity check: the asset must belong to the creator being scraped.
        if ($newScrapedAsset->creator === null || $newScrapedAsset->creator !== $creator) {
            Log::write("Skipping asset with mismatched creator: ", $newScrapedAsset, LogLevel::WARNING);
            continue;
        }

        // Convert scraped asset to stored asset
        $newStoredAsset = $newScrapedAsset->toStoredAsset();

        // Write the asset and its thumbnail variations inside one transaction.
        // NOTE(review): nothing rolls the transaction back if a call below
        // throws — confirm whether Database offers a rollback and add it here.
        Database::startTransaction();
        $newStoredAsset->writeToDatabase();

        // Thumbnails are only saved when the asset has an id and a raw
        // thumbnail was scraped.
        if ($newStoredAsset->id !== null && $newScrapedAsset->rawThumbnail !== null) {
            Thumbnail::saveThumbnailVariations($newStoredAsset->id, $newScrapedAsset->rawThumbnail);
            Log::write("Saved thumbnail for asset ", $newStoredAsset->id);
        }

        Database::commitTransaction();
        $writtenCount++;
        Log::write("Committed new asset to DB", $newStoredAsset, LogLevel::INFO);
    }

    Log::write("Wrote " . $writtenCount . " new assets.", LogLevel::INFO);
} else {
    Log::write("No new updates to write to DB.", LogLevel::INFO);
}
// Reaching this point means the run completed, so undo the pessimistic
// failed-attempt increment made before scraping started.
$creator->resetFailedAttempts($now);
// Housekeeping after the run.
// NOTE(review): presumably database maintenance and removal of thumbnail
// files without a matching asset — confirm against Database / Thumbnail.
Database::optimize();
Thumbnail::deleteOrphanedThumbnails();
// Close the log started at the top of the script.
// NOTE(review): meaning of the `true` argument is not visible here — confirm.
Log::stop(true);