1
+ <?php
2
+ /**
3
+ * Created by PhpStorm.
4
+ * User: charles
5
+ * Date: 15/04/2015
6
+ * Time: 14:52
7
+ */
8
+
9
abstract class AbstractSearchEngineIndexer extends AJXP_AbstractMetaSource {

    /**
     * Extract the textual content of a node so that it can be fed to a search index.
     *
     * The strategy is chosen from the (lower-cased) extension of the node label:
     *  - extensions listed in the comma-separated PARSE_CONTENT_TXT option:
     *    the raw file content is returned as-is;
     *  - doc / odt (converted to txt) and xls / ods (converted to csv): converted
     *    with the command configured in the UNOCONV option, the command output
     *    being treated as ISO-8859-1 and transliterated to ASCII;
     *  - odp / ppt: converted to a temporary PDF with the UNOCONV command, then
     *    processed like a PDF;
     *  - pdf: text extracted with the command configured in the PDFTOTEXT option,
     *    the output being treated as UTF-8 and transliterated to ASCII.
     *
     * @param AJXP_Node $ajxpNode
     * @return null|string Extracted text, or null when the extension is not handled,
     *                     the required tool is not configured, or extraction failed.
     */
    protected function extractIndexableContent($ajxpNode){

        $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));

        // Tolerate "txt, md"-style option values by trimming each entry.
        $plainTextExtensions = array_map("trim", explode(",", (string) $this->getFilteredOption("PARSE_CONTENT_TXT")));
        if (in_array($ext, $plainTextExtensions, true)) {
            $content = file_get_contents($ajxpNode->getUrl());
            return ($content === false) ? null : $content;
        }

        // Path of the intermediate PDF generated for presentations, if any.
        $generatedPdf = null;

        $unoconv = $this->getFilteredOption("UNOCONV");
        if (!empty($unoconv) && in_array($ext, array("doc", "odt", "xls", "ods", "odp", "ppt"), true)) {
            $toPdf = in_array($ext, array("odp", "ppt"), true);
            if ($toPdf) {
                $targetExt = "pdf";
            } else if (in_array($ext, array("xls", "ods"), true)) {
                $targetExt = "csv";
            } else {
                $targetExt = "txt";
            }

            $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
            $tmpDir = AJXP_Utils::getAjxpTmpDir();
            // $unoconv is the administrator-configured command line and is deliberately
            // left unescaped; every path argument is escaped.
            $command = "HOME=".escapeshellarg($tmpDir)." ".$unoconv." --stdout -f ".$targetExt." ".escapeshellarg($realFile);

            if ($toPdf) {
                // Write the PDF to a unique temporary file instead of deriving a name
                // from the source path: a derived name can collide with the source
                // itself, which would then be truncated by the redirect and deleted
                // by the shutdown handler.
                $generatedPdf = tempnam($tmpDir, "ajxp_unoconv_");
                if ($generatedPdf === false) {
                    return null;
                }
                register_shutdown_function("unlink", $generatedPdf);
                $command .= " > ".escapeshellarg($generatedPdf);
            }

            $output = array();
            exec($command, $output, $return);

            if (!$toPdf) {
                return $this->convertOutputToAscii(implode("\n", $output), 'ISO-8859-1');
            }

            // The file was written by an external process: drop any cached stat data
            // before checking that the conversion actually produced something.
            clearstatcache(true, $generatedPdf);
            if ($return !== 0 || !is_file($generatedPdf) || filesize($generatedPdf) === 0) {
                return null;
            }
            $ext = "pdf";
        }

        $pdftotext = $this->getFilteredOption("PDFTOTEXT");
        if (!empty($pdftotext) && $ext === "pdf") {
            if ($generatedPdf !== null) {
                $realFile = $generatedPdf;
            } else {
                $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
            }
            // Trailing "-" asks pdftotext to write the text to stdout.
            $cmd = $pdftotext." ".escapeshellarg($realFile)." -";
            $output = array();
            exec($cmd, $output, $return);
            return $this->convertOutputToAscii(implode("\n", $output), 'UTF-8');
        }

        return null;
    }

    /**
     * Transliterate a converter's output to plain ASCII.
     *
     * @param string $text           Raw command output.
     * @param string $sourceEncoding Encoding the output is assumed to be in.
     * @return null|string ASCII text, or null when iconv() fails.
     */
    private function convertOutputToAscii($text, $sourceEncoding){
        $ascii = iconv($sourceEncoding, 'ASCII//TRANSLIT//IGNORE', $text);
        return ($ascii === false) ? null : $ascii;
    }

}
0 commit comments