use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\Response;
use Illuminate\Support\Arr;
use Illuminate\Support\Facades\Gate;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
use Vigilant\Core\Services\TeamService;
use Vigilant\Crawler\Enums\State;
use Vigilant\Crawler\Models\CrawledUrl;
use Vigilant\Crawler\Models\IgnoredUrl;
use Vigilant\Crawler\Notifications\RatelimitedNotification;

1517class CrawlUrl
@@ -105,9 +107,28 @@ public function crawl(CrawledUrl $url, int $try = 0): void
105107 ];
106108 }
107109
110+ $ existingLinks = CrawledUrl::query ()
111+ ->where ('crawler_id ' , '= ' , $ url ->crawler_id )
112+ ->whereIn ('url_hash ' , Arr::pluck ($ queuedLinks , 'url_hash ' ))
113+ ->pluck ('url_hash ' )
114+ ->all ();
115+
116+ $ queuedLinks = array_filter ($ queuedLinks , function (array $ record ) use ($ existingLinks ): bool {
117+ return ! in_array ($ record ['url_hash ' ], $ existingLinks , true );
118+ });
119+
108120 if ($ queuedLinks !== []) {
109121 $ timestamp = now ();
110122 $ records = [];
123+ $ ignoredHashes = [];
124+
125+ if ($ url ->crawler_id !== null ) {
126+ $ ignoredHashes = IgnoredUrl::query ()
127+ ->where ('crawler_id ' , '= ' , $ url ->crawler_id )
128+ ->whereIn ('url_hash ' , array_keys ($ queuedLinks ))
129+ ->pluck ('url_hash ' )
130+ ->all ();
131+ }
111132
112133 foreach ($ queuedLinks as $ record ) {
113134 $ records [] = [
@@ -117,6 +138,7 @@ public function crawl(CrawledUrl $url, int $try = 0): void
117138 'url_hash ' => $ record ['url_hash ' ],
118139 'url ' => $ record ['url ' ],
119140 'found_on_id ' => $ record ['found_on_id ' ],
141+ 'ignored ' => in_array ($ record ['url_hash ' ], $ ignoredHashes , true ),
120142 'crawled ' => false ,
121143 'created_at ' => $ timestamp ,
122144 'updated_at ' => $ timestamp ,
0 commit comments