Skip to content

Commit d8354d1

Browse files
author
Stanislav
committed
Converting the PDFs to HTML and using them in the search metadata
1 parent cfebe79 commit d8354d1

10 files changed

+249
-5
lines changed

css/os2web_cp_service.css

100644100755
File mode changed.

images/cal.png

100644100755
File mode changed.

js/jquery.qtip-1.0.0-rc3.min.js

100644100755
File mode changed.

js/os2web_cp_service.js

100644100755
File mode changed.

os2web_cp_service.info

100644100755
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ dependencies[] = menu
1717
dependencies[] = node
1818
dependencies[] = number
1919
dependencies[] = options
20-
dependencies[] = os2web_acadre_esdh
20+
;dependencies[] = os2web_acadre_esdh
2121
dependencies[] = page_manager
2222
dependencies[] = panels
2323
dependencies[] = pathauto

os2web_cp_service.install

100644100755
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,42 @@ function os2web_cp_service_install() {
1919
// taxonomy_term_save($term);
2020
// }
2121
}
22+
23+
/**
 * Implements hook_schema().
 *
 * Defines the table used as a work queue for the PDF -> HTML conversion of
 * CP documents. Each row tracks one document file attached to one case; the
 * 'status' column is NULL for new rows and is otherwise set to 'downloaded',
 * 'converted', 'done' or an 'ERROR: ...' string by the conversion code.
 *
 * The original definition used 'case_nid' alone as the primary key, which
 * allowed only a single document per case: scheduling a second document for
 * the same case failed with an integrity constraint violation. A surrogate
 * serial key is used instead, with a plain index on 'case_nid'.
 *
 * The 'serialize' flags were dropped: the columns hold plain strings that are
 * written with db_insert()/db_update() and compared in query conditions.
 *
 * NOTE(review): sites where the table was already installed with the old
 * definition need a hook_update_N() implementation to apply this change.
 */
function os2web_cp_service_schema() {
  $schema['os2web_cp_service_documents_conversion'] = array(
    'description' => 'This table is used as schedule for PDF -> HTML convertion of CP Documents',
    'fields' => array(
      'id' => array(
        'description' => 'Primary identifier of the conversion job',
        'type' => 'serial',
        'unsigned' => TRUE,
        'not null' => TRUE,
      ),
      'file_id' => array(
        'description' => 'The ID of the document file located remotely',
        'type' => 'varchar',
        'length' => 1024,
      ),
      'case_nid' => array(
        'description' => 'The nid of the CP Case, which metadata field should be updated with file content',
        'type' => 'int',
        'unsigned' => TRUE,
        'not null' => TRUE,
      ),
      'filepath_pdf' => array(
        'description' => 'The path of the downloaded PDF',
        'type' => 'varchar',
        'length' => 1024,
      ),
      'filepath_html' => array(
        'description' => 'The path of the created HTML output',
        'type' => 'varchar',
        'length' => 1024,
      ),
      'status' => array(
        'description' => 'The information about the job',
        'type' => 'varchar',
        'length' => 1024,
      ),
    ),
    'primary key' => array('id'),
    'indexes' => array(
      'case_nid' => array('case_nid'),
    ),
  );
  return $schema;
}

os2web_cp_service.make

100644100755
File mode changed.

os2web_cp_service.module

100644100755
Lines changed: 208 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
*/
77

88
include_once 'os2web_cp_service.features.inc';
9+
define('DEFAULT_CASE_FILE_LIMIT', 50);//number of files to handle at once
910

1011
/**
1112
* Implements hook_init().
@@ -73,9 +74,7 @@ function os2web_cp_service_handler() {
7374
* Callback for the file provider service.
7475
*/
7576
function os2web_gf_service_handler($file_id) {
76-
7777
if ($url = variable_get('os2web_cp_service_cp_document_fileurl')) {
78-
7978
$username = variable_get('os2web_cp_service_endpoint_user');
8079
$password = variable_get('os2web_cp_service_endpoint_password');
8180
if (!empty($username) && !empty($password)) {
@@ -114,7 +113,6 @@ function os2web_gf_service_handler($file_id) {
114113
$nids = (isset($result['node']))?array_keys($result['node']) : NULL;
115114

116115
$node = node_load(array_pop($nids));
117-
118116
if ($node) {
119117
$filename = str_replace('/', '_', $node->field_os2web_cp_service_doc_id[LANGUAGE_NONE][0]['value'] . '.' . os2web_cp_service_get_extension_from_mime($header['content_type']));
120118
drupal_add_http_header('Content-Disposition', 'attachment; filename=' . $filename);
@@ -528,6 +526,8 @@ function os2web_cp_service_create_document(array $data) {
528526
}
529527
if ($is_missing) {
530528
$cnode->field_os2web_cp_service_doc_ref[LANGUAGE_NONE][]['target_id'] = $node->nid;
529+
//adding for pdf 2 html conversion, [email protected]
530+
os2web_cp_service_schedule_document_pdf2html_conversion($data['fields']['FilID'], $cnode->nid);
531531
node_save($cnode);
532532
}
533533
}
@@ -912,3 +912,208 @@ function os2web_cp_service_os2web_help($sections) {
912912
return $sections;
913913

914914
}
915+
/**
 * Schedules a document for PDF to HTML conversion.
 *
 * Adds a row for the document to the os2web_cp_service_documents_conversion
 * table with an empty (NULL) status, which is what the cron download step
 * selects on.
 *
 * Scheduling is auxiliary to the import that calls this function, so a
 * database failure here (for example a key violation) is logged instead of
 * being allowed to abort the caller.
 *
 * @param string $file_id
 *   ID of the file on the remote server.
 * @param int $case_nid
 *   Nid of the destination case whose metadata field should be updated.
 */
function os2web_cp_service_schedule_document_pdf2html_conversion($file_id, $case_nid) {
  try {
    db_insert('os2web_cp_service_documents_conversion')
      ->fields(array(
        'file_id' => $file_id,
        'case_nid' => $case_nid,
        'filepath_pdf' => NULL,
        'filepath_html' => NULL,
        'status' => NULL,
      ))
      ->execute();
  }
  catch (Exception $e) {
    // Keep the surrounding import running; the failure is recorded in the log.
    watchdog_exception('os2web_cp_service', $e);
  }
}
934+
935+
/**
 * Implements hook_cron().
 *
 * Walks the conversion table in three passes, each limited to
 * DEFAULT_CASE_FILE_LIMIT rows:
 *  - rows without a status are downloaded as PDF,
 *  - rows marked 'downloaded' are converted from PDF to HTML,
 *  - rows marked 'converted' have their text appended to the case metadata.
 */
function os2web_cp_service_cron() {
  // Pass 1: download the documents that have not been handled yet.
  $to_download = db_select('os2web_cp_service_documents_conversion', 'dc')
    ->fields('dc', array('file_id'))
    ->isNull('dc.status')
    ->range(0, DEFAULT_CASE_FILE_LIMIT)
    ->execute();
  foreach ($to_download as $row) {
    _os2web_cp_service_document_download($row->file_id);
  }

  // Pass 2: convert the downloaded PDF files.
  $to_convert = db_select('os2web_cp_service_documents_conversion', 'dc')
    ->fields('dc', array('file_id', 'filepath_pdf'))
    ->condition('dc.status', 'downloaded')
    ->range(0, DEFAULT_CASE_FILE_LIMIT)
    ->execute();
  foreach ($to_convert as $row) {
    _os2web_cp_service_document_convert($row->file_id, $row->filepath_pdf);
  }

  // Pass 3: push the converted text into the case metadata field.
  $to_index = db_select('os2web_cp_service_documents_conversion', 'dc')
    ->fields('dc', array('file_id', 'case_nid', 'filepath_pdf', 'filepath_html'))
    ->condition('dc.status', 'converted')
    ->range(0, DEFAULT_CASE_FILE_LIMIT)
    ->execute();
  foreach ($to_index as $row) {
    _os2web_cp_service_update_case_metadata($row->file_id, $row->case_nid, $row->filepath_pdf, $row->filepath_html);
  }
}
974+
975+
/**
 * Downloads a document file into a temporary location.
 *
 * The outcome is recorded on the rows of the conversion table matching
 * $file_id:
 *  - HTTP 200: 'filepath_pdf' is set to the temporary file and the status
 *    becomes 'downloaded'.
 *  - HTTP 404: the status becomes 'ERROR: 404 not found'.
 *  - Any other code: the status becomes 'ERROR: {httpCode}'.
 *
 * On failure the temporary file is removed so failed downloads do not pile
 * up in the temporary directory. If the temporary file cannot be created at
 * all, the problem is logged and the row is left untouched so that the next
 * cron run retries it.
 *
 * @param string $file_id
 *   ID of the document file.
 */
function _os2web_cp_service_document_download($file_id) {
  // Address the file is fetched from, built from this site's base URL.
  $url = $GLOBALS['base_url'] . '/?q=os2web/service/gf/v1/' . $file_id;

  // Path the file will be downloaded to.
  $tmpfname = tempnam(file_directory_temp(), "os2web_cp_document_");
  $fp = ($tmpfname !== FALSE) ? fopen($tmpfname, 'w') : FALSE;
  if ($fp === FALSE) {
    watchdog('os2web_cp_service', 'Could not create a temporary file for document %file_id.', array('%file_id' => $file_id), WATCHDOG_ERROR);
    if ($tmpfname !== FALSE && file_exists($tmpfname)) {
      unlink($tmpfname);
    }
    return;
  }

  // Stream the response body straight into the temporary file.
  $ch = curl_init($url);
  curl_setopt($ch, CURLOPT_FILE, $fp);
  curl_exec($ch);
  $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
  curl_close($ch);
  fclose($fp);

  if ($httpCode == 200) {
    $fields = array(
      'filepath_pdf' => $tmpfname,
      'status' => 'downloaded',
    );
  }
  else {
    // Whatever was written is not the document; do not leave it behind.
    unlink($tmpfname);
    $fields = array(
      'status' => ($httpCode == 404) ? 'ERROR: 404 not found' : 'ERROR: ' . $httpCode,
    );
  }

  db_update('os2web_cp_service_documents_conversion')
    ->fields($fields)
    ->condition('file_id', $file_id)
    ->execute();
}
1026+
1027+
/**
 * Converts a downloaded PDF into HTML using the pdf2htmlEX command.
 *
 * The outcome is recorded on the rows of the conversion table matching
 * $file_id:
 *  - PDF missing: 'filepath_pdf' and the status are reset to NULL so that
 *    the next cron run downloads the file again.
 *  - HTML produced: 'filepath_html' is set and the status becomes
 *    'converted'.
 *  - No HTML produced: the status becomes 'ERROR: conversion failed'.
 *    Without this check a failing conversion would be marked 'converted',
 *    sent back to 'downloaded' by the metadata step and retried on every
 *    cron run indefinitely.
 *
 * @param string $file_id
 *   ID of the document file.
 * @param string $path_to_pdf
 *   Path to the PDF version of the file.
 */
function _os2web_cp_service_document_convert($file_id, $path_to_pdf) {
  if (empty($path_to_pdf) || !file_exists($path_to_pdf)) {
    // Nothing to convert: send the document back for downloading.
    $fields = array(
      'filepath_pdf' => NULL,
      'status' => NULL,
    );
  }
  else {
    // The expected output name is the input name with '.html' appended. The
    // output is written next to the PDF so it matches the recorded path.
    $path_to_html = $path_to_pdf . '.html';
    // Both paths are escaped; they must never be interpreted by the shell.
    shell_exec('pdf2htmlEX ' . escapeshellarg($path_to_pdf) . ' --dest-dir ' . escapeshellarg(dirname($path_to_pdf)));

    if (file_exists($path_to_html)) {
      $fields = array(
        'filepath_html' => $path_to_html,
        'status' => 'converted',
      );
    }
    else {
      watchdog('os2web_cp_service', 'pdf2htmlEX produced no output for document %file_id.', array('%file_id' => $file_id), WATCHDOG_ERROR);
      $fields = array(
        'status' => 'ERROR: conversion failed',
      );
    }
  }

  db_update('os2web_cp_service_documents_conversion')
    ->fields($fields)
    ->condition('file_id', $file_id)
    ->execute();
}
1058+
1059+
/**
 * Appends the plain text of a converted document to a case's search metadata.
 *
 * The HTML is reduced to plain text (style and script blocks, tags and line
 * breaks removed, entities decoded) and appended to the case node's
 * field_os2web_cp_service_searchmt value. The outcome is recorded on the
 * rows of the conversion table matching $file_id:
 *  - HTML missing or unreadable: 'filepath_html' is cleared and the status
 *    is set back to 'downloaded' so the next cron run converts it again.
 *  - Case node not found: the status becomes 'ERROR: node not found'.
 *  - Success: the status becomes 'done'.
 *
 * The temporary PDF and HTML files are deleted once the row has reached one
 * of the two final states (done, or node not found).
 *
 * @param string $file_id
 *   ID of the document file.
 * @param int $case_nid
 *   Nid of the case node whose metadata should be updated.
 * @param string $path_to_pdf
 *   Path to the PDF version of the file.
 * @param string $path_to_html
 *   Path to the HTML version of the file.
 */
function _os2web_cp_service_update_case_metadata($file_id, $case_nid, $path_to_pdf, $path_to_html) {
  $cleanup = FALSE;
  // Used whenever the HTML cannot be used; the PDF's existence is checked
  // again by the conversion step.
  $reconvert = array(
    'filepath_html' => NULL,
    'status' => 'downloaded',
  );

  if (empty($path_to_html) || !file_exists($path_to_html)) {
    $fields = $reconvert;
  }
  elseif (!($case_node = node_load($case_nid))) {
    $fields = array(
      'status' => 'ERROR: node not found',
    );
    $cleanup = TRUE;
  }
  elseif (($text = file_get_contents($path_to_html)) === FALSE) {
    $fields = $reconvert;
  }
  else {
    // Strip the markup down to plain text.
    // Removing unneeded paragraphs.
    $text = str_replace('<p>&nbsp;</p>', ' ', $text);
    // Removing style and script blocks including their content.
    $text = preg_replace('#<style(.*?)>(.*?)</style>#is', ' ', $text);
    $text = preg_replace('#<script(.*?)>(.*?)</script>#is', ' ', $text);
    $text = strip_tags($text);
    // Entities such as &amp; would otherwise end up verbatim in the text.
    $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
    $text = str_replace(array("\r\n", "\n\r", "\n", "\r"), ' ', $text);

    // The field may still be empty for this case.
    $existing = isset($case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'])
      ? $case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value']
      : '';
    // A separator keeps the last word of the existing value and the first
    // word of the new text from being glued together.
    $case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'] = ($existing === '') ? $text : $existing . ' ' . $text;
    node_save($case_node);

    $fields = array(
      'status' => 'done',
    );
    $cleanup = TRUE;
  }

  db_update('os2web_cp_service_documents_conversion')
    ->fields($fields)
    ->condition('file_id', $file_id)
    ->execute();

  // Temporary files cleanup.
  if ($cleanup) {
    foreach (array($path_to_html, $path_to_pdf) as $tmp_path) {
      if (!empty($tmp_path) && file_exists($tmp_path)) {
        unlink($tmp_path);
      }
    }
  }
}

os2web_cp_service.strongarm.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ function os2web_cp_service_strongarm() {
8787
$strongarm->disabled = FALSE; /* Edit this to true to make a default strongarm disabled initially */
8888
$strongarm->api_version = 1;
8989
$strongarm->name = 'pathauto_punctuation_slash';
90-
$strongarm->value = '2';
90+
$strongarm->value = '0';
9191
$export['pathauto_punctuation_slash'] = $strongarm;
9292

9393
return $export;

theme/views-exposed-form--os2web-cp-service-cp-case-search.tpl.php

100644100755
File mode changed.

0 commit comments

Comments
 (0)