6
6
*/
7
7
8
8
include_once 'os2web_cp_service.features.inc';
9
+ define('DEFAULT_CASE_FILE_LIMIT', 50);//number of files to handle at once
9
10
10
11
/**
11
12
* Implements hook_init().
@@ -73,9 +74,7 @@ function os2web_cp_service_handler() {
73
74
* Callback for the file provider service.
74
75
*/
75
76
function os2web_gf_service_handler($file_id) {
76
-
77
77
if ($url = variable_get('os2web_cp_service_cp_document_fileurl')) {
78
-
79
78
$username = variable_get('os2web_cp_service_endpoint_user');
80
79
$password = variable_get('os2web_cp_service_endpoint_password');
81
80
if (!empty($username) && !empty($password)) {
@@ -114,7 +113,6 @@ function os2web_gf_service_handler($file_id) {
114
113
$nids = (isset($result['node']))?array_keys($result['node']) : NULL;
115
114
116
115
$node = node_load(array_pop($nids));
117
-
118
116
if ($node) {
119
117
$filename = str_replace('/', '_', $node->field_os2web_cp_service_doc_id[LANGUAGE_NONE][0]['value'] . '.' . os2web_cp_service_get_extension_from_mime($header['content_type']));
120
118
drupal_add_http_header('Content-Disposition', 'attachment; filename=' . $filename);
@@ -528,6 +526,8 @@ function os2web_cp_service_create_document(array $data) {
528
526
}
529
527
if ($is_missing) {
530
528
$cnode->field_os2web_cp_service_doc_ref[LANGUAGE_NONE][]['target_id'] = $node->nid;
529
+ //adding for pdf 2 html conversion,
[email protected]
530
+ os2web_cp_service_schedule_document_pdf2html_conversion($data['fields']['FilID'], $cnode->nid);
531
531
node_save($cnode);
532
532
}
533
533
}
@@ -912,3 +912,208 @@ function os2web_cp_service_os2web_help($sections) {
912
912
return $sections;
913
913
914
914
}
915
/**
 * Queues a document for PDF to HTML conversion.
 *
 * Inserts one row for the document into the
 * os2web_cp_service_documents_conversion table, with both file paths and the
 * status left as NULL.
 *
 * @param string $file_id
 *   Id of the file on the remote server.
 * @param int $case_nid
 *   Nid of the destination case node whose metadata field should be updated.
 */
function os2web_cp_service_schedule_document_pdf2html_conversion($file_id, $case_nid) {
  $row = array(
    'file_id' => $file_id,
    'case_nid' => $case_nid,
    'filepath_pdf' => NULL,
    'filepath_html' => NULL,
    'status' => NULL,
  );

  $insert = db_insert('os2web_cp_service_documents_conversion');
  $insert->fields($row);
  $insert->execute();
}
934
+
935
/**
 * Implements hook_cron().
 *
 * Walks the os2web_cp_service_documents_conversion table in three stages,
 * each limited to DEFAULT_CASE_FILE_LIMIT rows per run:
 *  - rows with a NULL status are downloaded as PDF,
 *  - rows with status "downloaded" are converted from PDF to HTML,
 *  - rows with status "converted" have their text appended to the case
 *    metadata field.
 */
function os2web_cp_service_cron() {
  $table = 'os2web_cp_service_documents_conversion';

  // Stage 1: download the documents that have not been processed yet.
  $to_download = db_select($table, 'dc')
    ->fields('dc', array('file_id'))
    ->isNull('dc.status')
    ->range(0, DEFAULT_CASE_FILE_LIMIT)
    ->execute();
  foreach ($to_download as $row) {
    _os2web_cp_service_document_download($row->file_id);
  }

  // Stage 2: convert the downloaded PDF files to HTML.
  $to_convert = db_select($table, 'dc')
    ->fields('dc', array('file_id', 'filepath_pdf'))
    ->condition('dc.status', 'downloaded')
    ->range(0, DEFAULT_CASE_FILE_LIMIT)
    ->execute();
  foreach ($to_convert as $row) {
    _os2web_cp_service_document_convert($row->file_id, $row->filepath_pdf);
  }

  // Stage 3: push the converted text into the case metadata field.
  $to_index = db_select($table, 'dc')
    ->fields('dc', array('file_id', 'case_nid', 'filepath_pdf', 'filepath_html'))
    ->condition('dc.status', 'converted')
    ->range(0, DEFAULT_CASE_FILE_LIMIT)
    ->execute();
  foreach ($to_index as $row) {
    _os2web_cp_service_update_case_metadata($row->file_id, $row->case_nid, $row->filepath_pdf, $row->filepath_html);
  }
}
974
+
975
/**
 * Downloads a document file into a temporary location.
 *
 * On success (HTTP 200) the database entry is updated with the path of the
 * created file and the status "downloaded".
 *
 * If the file is not found (HTTP 404), the status is changed to
 * "ERROR: 404 not found". If the URL returns any other HTTP code, the status
 * is changed to "ERROR: {httpCode}"; a request that never got a response is
 * recorded as "ERROR: 0". In every error case the temporary file is deleted
 * again so failed downloads do not pile up in the temporary directory.
 *
 * If the temporary file cannot be created at all, the problem is logged and
 * the database entry is left untouched, so the next cron run retries it.
 *
 * @param string $file_id
 *   Id of the document file.
 */
function _os2web_cp_service_document_download($file_id) {
  // The file is requested through this site's own file provider path.
  $url = $GLOBALS['base_url'] . '/?q=os2web/service/gf/v1/' . $file_id;

  // Path the file will be downloaded to.
  $tmpfname = tempnam(file_directory_temp(), 'os2web_cp_document_');
  $fp = ($tmpfname !== FALSE) ? fopen($tmpfname, 'w') : FALSE;
  if ($fp === FALSE) {
    watchdog('os2web_cp_service', 'Could not create a temporary file for document %id.', array('%id' => $file_id), WATCHDOG_ERROR);
    if ($tmpfname !== FALSE && file_exists($tmpfname)) {
      unlink($tmpfname);
    }
    return;
  }

  // 0 is what cURL itself reports when no HTTP response was received.
  $http_code = 0;
  $ch = curl_init($url);
  if ($ch !== FALSE) {
    curl_setopt($ch, CURLOPT_FILE, $fp);
    curl_exec($ch);
    $http_code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);
  }
  fclose($fp);

  if ($http_code === 200) {
    $fields = array(
      'filepath_pdf' => $tmpfname,
      'status' => 'downloaded',
    );
  }
  else {
    // Nothing usable was downloaded; do not leave the temp file behind.
    unlink($tmpfname);
    $fields = array(
      'status' => ($http_code === 404) ? 'ERROR: 404 not found' : 'ERROR: ' . $http_code,
    );
  }

  db_update('os2web_cp_service_documents_conversion')
    ->fields($fields)
    ->condition('file_id', $file_id)
    ->execute();
}
1026
+
1027
/**
 * Converts the downloaded PDF into HTML using the pdf2htmlEX command.
 *
 * Also updates the database entry with the path of the HTML file and the
 * status "converted".
 *
 * If the PDF file is not found, the PDF path and the status are reset to
 * NULL, so that the next cron run downloads the file again.
 *
 * NOTE(review): the stored HTML path is "<pdf path>.html" while the command
 * writes into file_directory_temp(); this presumably relies on the PDF being
 * located in that same directory and on pdf2htmlEX naming its output after
 * the input file - confirm.
 *
 * @param string $file_id
 *   Id of the document file.
 * @param string $path_to_pdf
 *   Path to the PDF version of the file.
 */
function _os2web_cp_service_document_convert($file_id, $path_to_pdf) {
  if (empty($path_to_pdf) || !file_exists($path_to_pdf)) {
    // The PDF is gone: send the document back for redownloading.
    $fields = array(
      'filepath_pdf' => NULL,
      'status' => NULL,
    );
  }
  else {
    // Escape both paths so that spaces or shell metacharacters in them cannot
    // break or alter the command.
    $command = 'pdf2htmlEX ' . escapeshellarg($path_to_pdf)
      . ' --dest-dir ' . escapeshellarg(file_directory_temp());
    shell_exec($command);

    $fields = array(
      'filepath_html' => $path_to_pdf . '.html',
      'status' => 'converted',
    );
  }

  db_update('os2web_cp_service_documents_conversion')
    ->fields($fields)
    ->condition('file_id', $file_id)
    ->execute();
}
1058
+
1059
/**
 * Appends the plain text of a converted document to the case search metadata.
 *
 * Takes the content of the HTML file, removes everything except the pure
 * text and appends this text to the field_os2web_cp_service_searchmt value of
 * the case node. The entry is then marked "done" and the temporary files
 * (PDF and HTML) are deleted.
 *
 * If the HTML file is not found or cannot be read, the status is changed to
 * "downloaded" so that the next cron run converts the file again.
 * If the case node is not found, the status is changed to
 * "ERROR: node not found"; the temporary files are deleted in that case too,
 * because cron only selects entries with a NULL, "downloaded" or "converted"
 * status.
 *
 * @param string $file_id
 *   Id of the document file.
 * @param int $case_nid
 *   Node id of the case node whose metadata should be updated.
 * @param string $path_to_pdf
 *   Path to the PDF version of the file.
 * @param string $path_to_html
 *   Path to the HTML version of the file.
 */
function _os2web_cp_service_update_case_metadata($file_id, $case_nid, $path_to_pdf, $path_to_html) {
  $html = (!empty($path_to_html) && file_exists($path_to_html)) ? file_get_contents($path_to_html) : FALSE;
  if ($html === FALSE) {
    // Send for reconverting; the PDF's existence is checked in that step.
    db_update('os2web_cp_service_documents_conversion')
      ->fields(array(
        'filepath_html' => NULL,
        'status' => 'downloaded',
      ))
      ->condition('file_id', $file_id)
      ->execute();
    return;
  }

  $case_node = node_load($case_nid);
  if (!$case_node) {
    $status = 'ERROR: node not found';
  }
  else {
    $text = _os2web_cp_service_html_to_plain_text($html);

    // The field may still be empty on the node; avoid reading a missing index.
    $existing = isset($case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'])
      ? $case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value']
      : '';
    // Separate the new text from the existing metadata so that the last and
    // first words do not run together.
    $prefix = ($existing !== '') ? $existing . ' ' : '';
    $case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'] = $prefix . $text;
    node_save($case_node);

    $status = 'done';
  }

  db_update('os2web_cp_service_documents_conversion')
    ->fields(array(
      'status' => $status,
    ))
    ->condition('file_id', $file_id)
    ->execute();

  // Temporary files cleanup; either file may already be gone.
  foreach (array($path_to_html, $path_to_pdf) as $tmp_path) {
    if (!empty($tmp_path) && file_exists($tmp_path)) {
      unlink($tmp_path);
    }
  }
}

/**
 * Reduces an HTML document to a single line of plain text.
 *
 * Removes the placeholder paragraphs, style blocks and script blocks together
 * with their content, strips the remaining tags and replaces every line break
 * with a space.
 *
 * @param string $html
 *   The HTML markup.
 *
 * @return string
 *   The text content of the markup, without line breaks.
 */
function _os2web_cp_service_html_to_plain_text($html) {
  // Removing unneeded paragraphs.
  $text = str_replace('<p> </p>', ' ', $html);

  // Removing style and script blocks including their content. The patterns
  // are applied one after the other.
  $without_blocks = preg_replace(
    array(
      '#<style(.*?)>(.*?)</style>#is',
      '#<script(.*?)>(.*?)</script>#is',
    ),
    ' ',
    $text
  );
  // preg_replace() returns NULL on a PCRE error; keep the text rather than
  // silently losing the whole document.
  if ($without_blocks !== NULL) {
    $text = $without_blocks;
  }

  return str_replace(array("\r\n", "\n\r", "\n", "\r"), ' ', strip_tags($text));
}
0 commit comments