Skip to content

Commit 71e6e42

Browse files
committed
added summarization and bug fixes
1 parent ca2e048 commit 71e6e42

File tree

11 files changed

+185
-68
lines changed

11 files changed

+185
-68
lines changed

corenlp.json

Whitespace-only changes.

src/CoreNlp.php

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
* unzip stanford-corenlp-full-2018-02-27.zip
88
* cd stanford-corenlp-full-2018-02-27
99
* java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
10+
* -> -serverProperties StanfordCoreNLP-chinese.properties
1011
*/
1112
class CoreNlp
1213
{
@@ -81,12 +82,13 @@ public function post_call( $text )
8182

8283
$url = $this->api_url;
8384
$url .= "?properties=" . urlencode( json_encode( $this->properties ) );
84-
echo "URL: {$url} \n\n";
85+
86+
//echo "URL: {$url} \n\n";
8587

8688
$context = stream_context_create($opts);
8789
$result = @file_get_contents($url, false, $context);
8890

89-
file_put_contents("corenlp.json", $result);
91+
//file_put_contents("corenlp.json", $result);
9092

9193
return json_decode($result, 1);
9294
}

src/MsConceptGraph.php

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,14 @@ public function call( $url )
5151
if ( $this->debug ) echo "URL: $url \n";
5252

5353
$context_params = array(
54-
'http' => array(
54+
'http' => array(
5555
'method' => 'GET',
5656
'header' => "Content-Type: application/json\r\n",
57-
)
57+
),
58+
"ssl"=>array(
59+
"verify_peer"=>false,
60+
"verify_peer_name"=>false,
61+
),
5862
);
5963

6064
$context = stream_context_create( $context_params );

src/NlpClient.php

Lines changed: 87 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
namespace Web64\Nlp;
44

55
/**
6-
* Simple interface to the Web64 NLP-Server for Natural Language Processing tasks
6+
* Simple interface to the Web64 NLP-Server (https://github.com/web64/nlpserver) for Natural Language Processing tasks
77
*/
88

99
class NlpClient{
@@ -26,73 +26,43 @@ function __construct( $hosts, $debug = false )
2626
else
2727
$this->addHost( $hosts );
2828

29-
3029
// pick random host as default
31-
$this->api_url = $this->api_hosts[
32-
array_rand( $this->api_hosts )
33-
];
34-
}
35-
36-
public function addHost( $host )
37-
{
38-
$host = rtrim( $host , '/');
39-
40-
if ( array_search($host, $this->api_hosts) === false)
41-
$this->api_hosts[] = $host;
30+
$this->api_url = $this->api_hosts[ array_rand( $this->api_hosts ) ];
4231
}
43-
44-
// debug message
45-
private function msg( $value )
46-
{
47-
if ( $this->debug )
48-
{
49-
if ( is_array($value) )
50-
{
51-
print_r( $value );
52-
echo PHP_EOL;
53-
}
54-
else
55-
echo $value . PHP_EOL;
56-
}
57-
}
58-
59-
// find working host
60-
private function chooseHost()
32+
33+
/**
34+
* Spacy.io Entity Extraction
35+
*/
36+
public function spacy_entities( $text, $lang = 'en' )
6137
{
62-
$random_a = $this->api_hosts;
63-
shuffle($random_a); // pick random host
38+
$data = $this->post_call('/spacy/entities', ['text' => $text, 'lang' => $lang ] );
6439

65-
foreach( $random_a as $api_url )
66-
{
67-
$this->msg( "chooseHost() - Testing: $api_url ");
68-
69-
$content = @file_get_contents( $api_url );
70-
if ( empty( $content ) )
71-
{
40+
return ( !empty($data['entities']) ) ? $data['entities'] : null;
41+
}
7242

73-
$this->msg( $content );
74-
// Failed
75-
$this->msg( "- Ignoring failed API URL: $api_url " );
76-
//print_r( $http_response_header );
77-
}else{
78-
$this->api_url = $api_url;
79-
$this->msg( "- Working API URL: $api_url" );
80-
return true;
81-
82-
}
83-
$this->msg( $content );
84-
}
43+
/**
44+
* Summarize long text
45+
*/
46+
public function summarize( $text, $word_count = null )
47+
{
48+
$data = $this->post_call('/summarize', ['text' => $text, 'word_count' => $word_count ] );
8549

86-
return false;
50+
return ( !empty($data['summary']) ) ? $data['summary'] : null;
8751
}
88-
52+
53+
/**
54+
* Article Extraction from HTML
55+
*/
8956
public function newspaperHtml( $html )
9057
{
91-
$data = $this->post_call('/newspaper', ['text' => $html ] );
58+
$data = $this->post_call('/newspaper', ['html' => $html ] );
9259

9360
return ( !empty($data['newspaper']) ) ? $data['newspaper'] : null;
9461
}
9562

63+
/**
64+
* Article Extraction from URL
65+
*/
9666
public function newspaperUrl( $url )
9767
{
9868
$data = $this->get_call('/newspaper', ['url' => $url ] );
@@ -101,6 +71,9 @@ public function newspaperUrl( $url )
10171
}
10272

10373

74+
/**
75+
* Get neighbouring words
76+
*/
10477
public function embeddings( $word, $lang = 'en')
10578
{
10679
$data = $this->get_call('/embeddings', ['word' => $word, 'lang' => $lang ] );
@@ -109,7 +82,7 @@ public function embeddings( $word, $lang = 'en')
10982
}
11083

11184
/**
112-
* Get entities and sentiment analysis of text
85+
* Get entities and sentiment analysis of text
11386
*/
11487
public function polyglot( $text, $language = null )
11588
{
@@ -196,4 +169,62 @@ public function get_call($path, $params, $retry = 0)
196169
return json_decode($result, 1);
197170

198171
}
172+
173+
/**
174+
* Internals
175+
*/
176+
177+
public function addHost( $host )
178+
{
179+
$host = rtrim( $host , '/');
180+
181+
if ( array_search($host, $this->api_hosts) === false)
182+
$this->api_hosts[] = $host;
183+
}
184+
185+
// debug message
186+
private function msg( $value )
187+
{
188+
if ( $this->debug )
189+
{
190+
if ( is_array($value) )
191+
{
192+
print_r( $value );
193+
echo PHP_EOL;
194+
}
195+
else
196+
echo $value . PHP_EOL;
197+
}
198+
}
199+
200+
// find working host
201+
private function chooseHost()
202+
{
203+
$random_a = $this->api_hosts;
204+
shuffle($random_a); // pick random host
205+
206+
foreach( $random_a as $api_url )
207+
{
208+
$this->msg( "chooseHost() - Testing: $api_url ");
209+
210+
$content = @file_get_contents( $api_url );
211+
if ( empty( $content ) )
212+
{
213+
214+
$this->msg( $content );
215+
// Failed
216+
$this->msg( "- Ignoring failed API URL: $api_url " );
217+
//print_r( $http_response_header );
218+
}else{
219+
$this->api_url = $api_url;
220+
$this->msg( "- Working API URL: $api_url" );
221+
return true;
222+
223+
}
224+
$this->msg( $content );
225+
}
226+
227+
return false;
228+
}
229+
199230
}

tests/TestCase.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ public function setUp()
1515
'http://localhost:6400/',
1616
'http://localhost:6400/',
1717
],
18-
'debug' => true,
18+
'debug' => false,
1919
];
2020
}
2121

tests/Unit/CoreNlpTest.php

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,12 @@ public function test_core_nlp()
2424
Demirci reportedly worked as a Germany correspondent for Etha and lived in the city of Cologn.
2525
News of Demirci's arrest was first reported by Mesale Tolu, a colleague of his at Etha. Tolu herself was one of several German nationals arrested in Turkey year last year for political reasons. Tolu tweeted on Friday morning that Demirci was detained with two colleagues, Pınar Gayip and Semiha Sahin, during a raid. Demirci, who reportedly holds both German and Turkish citizenship, was in Istanbul on holiday, Tolu said. Reports suggested he was scheduled to fly back to Germany on Saturday.
2626
";
27-
echo $text . PHP_EOL. PHP_EOL;
27+
//echo $text . PHP_EOL. PHP_EOL;
2828
$entities = $corenlp->entities( $text );
29-
print_r( $entities );
29+
//print_r( $entities );
3030

3131

3232
$this->assertNotEmpty( $entities['COUNTRY'] );
33+
$this->assertNotEmpty( $entities['NATIONALITY'] );
3334
}
3435
}

tests/Unit/EmbeddingsTest.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ public function get_neighbours()
1313

1414
$neighbours = $nlp->embeddings('obama', 'no');
1515

16-
$this->msg( $neighbours );
16+
//$this->msg( $neighbours );
17+
1718
$this->assertNotEmpty($neighbours);
1819
}
1920
}

tests/Unit/LanguageDetectonTest.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ public function not_enough_text()
6363
$nlp = new \Web64\Nlp\NlpClient( $this->nlpserver_config['hosts'], $this->nlpserver_config['debug'] );
6464

6565
$detected_lang = $nlp->language( "?" );
66-
$this->msg( "Detected: lang:". $detected_lang );
66+
//$this->msg( "Detected: lang:". $detected_lang );
6767
$this->assertEquals('en', $detected_lang);;
6868
}
6969

tests/Unit/NewspaperTest.php

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,29 @@
66

77
class NewspaperTest extends TestCase
88
{
9+
// /** @test */
10+
// public function url_article_extraction()
11+
// {
12+
// $nlp = new \Web64\Nlp\NlpClient( $this->nlpserver_config['hosts'], $this->nlpserver_config['debug'] );
13+
14+
// $newspaper = $nlp->newspaperUrl('http://www.bbc.com/news/science-environment-43710766');
15+
16+
// $this->msg( $newspaper );
17+
// $this->assertNotEmpty($newspaper);
18+
// }
19+
920
/** @test */
10-
public function url_article_extraction()
21+
public function html_article_extraction()
1122
{
1223
$nlp = new \Web64\Nlp\NlpClient( $this->nlpserver_config['hosts'], $this->nlpserver_config['debug'] );
1324

14-
$newspaper = $nlp->newspaperUrl('http://www.bbc.com/news/science-environment-43710766');
15-
25+
$html = file_get_contents( 'http://www.bbc.com/news/science-environment-43710766' );
26+
$newspaper = $nlp->newspaperHtml( $html );
27+
1628
//$this->msg( $newspaper );
29+
1730
$this->assertNotEmpty($newspaper);
31+
$this->assertNotEmpty($newspaper['title']);
32+
$this->assertNotEmpty($newspaper['text']);
1833
}
1934
}

tests/Unit/SpacyEntitiesTest.php

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<?php
2+
3+
namespace Tests\Unit;
4+
5+
use Tests\TestCase;
6+
7+
class SpacyEntitiesTest extends TestCase
8+
{
9+
/** @test */
10+
public function get_summary()
11+
{
12+
$nlp = new \Web64\Nlp\NlpClient( $this->nlpserver_config['hosts'], $this->nlpserver_config['debug'] );
13+
14+
$text = "D. B. Cooper is a media epithet popularly used to refer to an unidentified man who hijacked a Boeing 727 aircraft in the airspace between Portland, Oregon, and Seattle, Washington, on November 24, 1971. He extorted $200,000 in ransom (equivalent to $1,210,000 in 2017) and parachuted to an uncertain fate. Despite an extensive manhunt and protracted FBI investigation, the perpetrator has never been located or identified. The case remains the only unsolved air piracy in commercial aviation history.";
15+
$entities = $nlp->spacy_entities( $text );
16+
17+
//$this->msg( $entities );
18+
19+
$this->assertNotEmpty( $entities );
20+
$this->assertNotEmpty( $entities['PERSON'] );
21+
$this->assertNotEmpty( $entities['ORG'] );
22+
}
23+
}

0 commit comments

Comments
 (0)