Skip to content

Commit 4a42cc8

Browse files
committed
ability to load stopwords from a PHP file or a TXT file
1 parent 939e698 commit 4a42cc8

File tree

4 files changed

+93
-11
lines changed

4 files changed

+93
-11
lines changed

README.md

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,36 @@ return [
315315
];
316316
```
317317

318-
In the example, the stemmer comes from the package [`wamania/php-stemmer`],
318+
In the example, the stemmer comes from the package
319+
[`wamania/php-stemmer`](https://github.com/wamania/php-stemmer),
319320
but any class with a `stem` method, or anything callable such as a closure, will do.
320321

321-
[`wamania/php-stemmer`]: https://github.com/wamania/php-stemmer
322+
As for stopwords, you can either list them directly in the config (as shown above),
323+
or load them from a file. The file can be either a TXT file (with one stopword per line),
324+
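or a PHP file that returns an array. The path is used as given, so a relative path is resolved against the current working directory; prefer an absolute path (for example one built with `storage_path()`) if that directory may vary: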
325+
326+
```php
327+
// config/scout.php
328+
return [
329+
// ...
330+
'sqlout' => [
331+
// ...
332+
'stopwords' => 'storage/app/stopwords/fr.php',
333+
// ...
334+
],
335+
];
336+
```
337+
338+
```php
339+
// storage/app/stopwords/fr.php
340+
return [
341+
'à',
342+
'le',
343+
'la',
344+
];
345+
```
346+
347+
You may want to use the package [`voku/stop-words`](https://github.com/voku/stop-words),
348+
which provides collections of stopwords for various languages as PHP files,
349+
or [`yooper/stop-words`](https://github.com/yooper/stop-words),
350+
which provides them as TXT files.

composer.json

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,12 @@
1313
"laravel/scout": "^9.0|^10.0"
1414
},
1515
"require-dev": {
16+
"orchestra/testbench": "^6.23|^7.0|^8.0|^9.0|^10.0",
17+
"phpunit/phpunit": "^9.0|^10.0|^11.0",
1618
"squizlabs/php_codesniffer": "^3.7",
1719
"wamania/php-stemmer": "^3.0|^4.0",
18-
"orchestra/testbench": "^6.23|^7.0|^8.0|^9.0|^10.0"
20+
"voku/stop-words": "^2.0",
21+
"yooper/stop-words": "^1.0"
1922
},
2023
"autoload": {
2124
"files": [
@@ -31,7 +34,9 @@
3134
}
3235
},
3336
"suggest": {
34-
"wamania/php-stemmer": "PHP stemmer that can be used together with Sqlout."
37+
"wamania/php-stemmer": "PHP stemmer that can be used together with Sqlout.",
38+
"voku/stop-words": "A collection of stop words for various languages.",
39+
"yooper/stop-words": "Another collection of stop words."
3540
},
3641
"extra": {
3742
"laravel": {

src/Engine.php

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ protected function processString($content)
4141

4242
// Remove stopwords & short words:
4343
$minLength = config('scout.sqlout.minimum_length', 0);
44-
$stopwords = config('scout.sqlout.stopwords', []);
44+
$stopwords = $this->loadStopWords();
4545
$words = (new Collection($words))->reject(function ($word) use ($minLength, $stopwords) {
4646
return mb_strlen($word) < $minLength || in_array($word, $stopwords);
4747
})->all();
@@ -69,6 +69,32 @@ protected function processString($content)
6969
return implode(' ', $words);
7070
}
7171

72+
/**
 * Resolve the configured stopwords into an array of words.
 *
 * The `scout.sqlout.stopwords` config value may be either an iterable of
 * words (returned as an array), or the path to a file. A file whose first
 * line starts with `<?php` is required and must return an iterable; any
 * other file is read as plain text, one stopword per line.
 *
 * @return array
 *
 * @throws Exception if the configured file does not exist, cannot be opened,
 *                   or is a PHP file that does not return an iterable
 */
protected function loadStopWords()
{
    $stopwords = config('scout.sqlout.stopwords', []);
    if (is_iterable($stopwords)) {
        // The result is handed to in_array(), which only accepts real arrays.
        return is_array($stopwords) ? $stopwords : iterator_to_array($stopwords, false);
    }

    $file = $stopwords;
    if (!is_string($file) || !is_file($file)) {
        $label = is_string($file) ? $file : gettype($file);
        throw new Exception("Can't import stop words from $label");
    }

    $stream = fopen($file, 'r');
    if ($stream === false) {
        throw new Exception("Can't import stop words from $file");
    }

    $isPhpFile = false;
    $words = [];
    try {
        // fgets() returns false on an empty file: there is nothing to load then.
        $line = fgets($stream);
        if ($line !== false) {
            // Only the opening tag is checked, so "<?php return [...];" on a
            // single line is recognised as a PHP file too.
            $isPhpFile = strncmp(trim($line), '<?php', 5) === 0;
        }

        if (!$isPhpFile) {
            while ($line !== false) {
                $word = trim($line);
                // Skip blank lines (typically a trailing newline at end of file).
                if ($word !== '') {
                    $words[] = $word;
                }
                $line = fgets($stream);
            }
        }
    } finally {
        // Always release the handle, whichever branch was taken.
        fclose($stream);
    }

    if (!$isPhpFile) {
        return $words;
    }

    $words = require $file;
    if (!is_iterable($words)) {
        throw new Exception("Can't import stop words from $file");
    }

    return is_array($words) ? $words : iterator_to_array($words, false);
}
97+
7298
/**
7399
* Update the given model in the index.
74100
*

tests/SearchTest.php

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -242,18 +242,40 @@ public function test_filters()
242242
$this->assertEquals('salut ça boume ?', $indexed);
243243
}
244244

245-
public function test_stopwords()
245+
/**
246+
* @dataProvider stopWordsProvider
247+
*/
248+
public function test_stopwords($config, $content, $expectedIndexedContent)
246249
{
247-
app('config')->set('scout.sqlout.stopwords', [
248-
'fuck',
249-
]);
250+
app('config')->set('scout.sqlout.stopwords', $config);
250251

251252
$post = Post::first();
252-
$post->body = 'shut the fuck up donny';
253+
$post->body = $content;
253254
$post->save();
254255

255256
$indexed = $this->newSearchQuery()->where('record_type', Post::class)->where('record_id', $post->id)->where('field', 'body')->value('content');
256-
$this->assertEquals('shut the up donny', $indexed);
257+
$this->assertEquals($expectedIndexedContent, $indexed);
258+
}
259+
260+
/**
 * Data sets for test_stopwords().
 *
 * Each set is [stopwords config value, post body, expected indexed content],
 * covering an inline array, a PHP file and a TXT file.
 *
 * File paths are built from this file's parent directory rather than given
 * relative to the working directory, so the test does not depend on where
 * PHPUnit is launched from. This assumes the test file sits in `tests/`
 * directly under the package root, next to `vendor/`.
 *
 * @return array
 */
public static function stopWordsProvider()
{
    $vendor = dirname(__DIR__) . '/vendor';

    return [
        'array' => [
            ['fuck'],
            'shut the fuck up donny',
            'shut the up donny',
        ],
        'PHP file' => [
            $vendor . '/voku/stop-words/src/voku/helper/stopwords/fr.php',
            'banco charlie alpha bravo',
            'charlie alpha bravo',
        ],
        'TXT file' => [
            $vendor . '/yooper/stop-words/data/stop-words_french_1_fr.txt',
            'bigre boum tsoin brrr kiki',
            'kiki',
        ],
    ];
}
258280

259281
public function test_minimum_length()

0 commit comments

Comments
 (0)