6
6
*/
7
7
8
8
include_once 'os2web_cp_service.features.inc';
9
- define('DEFAULT_CASE_FILE_LIMIT', 50);//number of files to handle at once
10
9
11
10
/**
12
11
* Implements hook_init().
@@ -66,7 +65,9 @@ function os2web_cp_service_handler() {
66
65
* Callback for the file provider service.
67
66
*/
68
67
function os2web_gf_service_handler($file_id) {
68
+
69
69
if ($url = variable_get('os2web_cp_service_cp_document_fileurl')) {
70
+
70
71
$username = variable_get('os2web_cp_service_endpoint_user');
71
72
$password = variable_get('os2web_cp_service_endpoint_password');
72
73
if (!empty($username) && !empty($password)) {
@@ -105,6 +106,7 @@ function os2web_gf_service_handler($file_id) {
105
106
$nids = (isset($result['node']))?array_keys($result['node']) : NULL;
106
107
107
108
$node = node_load(array_pop($nids));
109
+
108
110
if ($node) {
109
111
$filename = str_replace('/', '_', $node->field_os2web_cp_service_doc_id[LANGUAGE_NONE][0]['value'] . '.' . os2web_cp_service_get_extension_from_mime($header['content_type']));
110
112
drupal_add_http_header('Content-Disposition', 'attachment; filename=' . $filename);
@@ -518,8 +520,6 @@ function os2web_cp_service_create_document(array $data) {
518
520
}
519
521
if ($is_missing) {
520
522
$cnode->field_os2web_cp_service_doc_ref[LANGUAGE_NONE][]['target_id'] = $node->nid;
521
- //adding for pdf 2 html conversion,
[email protected]
522
- os2web_cp_service_schedule_document_pdf2html_conversion($data['fields']['FilID'], $cnode->nid);
523
523
node_save($cnode);
524
524
}
525
525
}
@@ -881,209 +881,3 @@ function os2web_cp_service_get_extension_from_mime($mime) {
881
881
$pieces = explode('/', $mime);
882
882
return array_pop($pieces);
883
883
}
884
-
885
- /**
886
- * Schedules a document for PDF-to-HTML conversion by adding a row for the document to the database table.
887
- *
888
- * @param string $file_id id of the file on the remote server
889
- * @param int $case_nid nid of the destination case, which metadata field should be updated
890
- *
891
- * @return none
892
- */
893
- function os2web_cp_service_schedule_document_pdf2html_conversion($file_id, $case_nid){
894
- db_insert('os2web_cp_service_documents_conversion')
895
- ->fields(array(
896
- 'file_id' => $file_id,
897
- 'case_nid' => $case_nid,
898
- 'filepath_pdf' => NULL,
899
- 'filepath_html' => NULL,
900
- 'status' => NULL,
901
- ))
902
- ->execute();
903
- }
904
-
905
- /**
906
- * Cron implementation.
907
- * Goes through the entries in the database table,
908
- * downloads the documents as pdf,
909
- * and converts the documents from pdf to html and updates the case metadata field with the document contents.
910
- *
911
- * @return none
912
- */
913
- function os2web_cp_service_cron(){
914
- //download
915
- $query = db_select('os2web_cp_service_documents_conversion', 'dc');
916
- $query->fields('dc',array('file_id'))
917
- ->isNull('dc.status')
918
- ->range(0,DEFAULT_CASE_FILE_LIMIT);
919
- $result = $query->execute();
920
- while($record = $result->fetchAssoc()) {
921
- _os2web_cp_service_document_download($record['file_id']);
922
- }
923
-
924
- //convert
925
- $query = db_select('os2web_cp_service_documents_conversion', 'dc');
926
- $query->fields('dc',array('file_id', 'filepath_pdf'))
927
- ->condition('dc.status', 'downloaded')
928
- ->range(0,DEFAULT_CASE_FILE_LIMIT);
929
- $result = $query->execute();
930
- while($record = $result->fetchAssoc()) {
931
- _os2web_cp_service_document_convert($record['file_id'], $record['filepath_pdf']);
932
- }
933
-
934
- //field updating
935
- $query = db_select('os2web_cp_service_documents_conversion', 'dc');
936
- $query->fields('dc',array('file_id', 'case_nid', 'filepath_pdf', 'filepath_html'))
937
- ->condition('dc.status', 'converted')
938
- ->range(0,DEFAULT_CASE_FILE_LIMIT);
939
- $result = $query->execute();
940
- while($record = $result->fetchAssoc()) {
941
- _os2web_cp_service_update_case_metadata($record['file_id'], $record['case_nid'], $record['filepath_pdf'], $record['filepath_html']);
942
- }
943
- }
944
-
945
- /**
946
- * Download the document file and places it into the temporary location
947
- * also updates the database entry with the created filepath
948
- *
949
- * If file is not found on the remote server, the status is changed to "ERROR: 404 not found"
950
- * If the URL returns any HTTP code other than 200, the status is changed to "ERROR: {httpCode}"
951
- *
952
- * @param $file_id string id of the document file
953
- *
954
- * @return none
955
- */
956
- function _os2web_cp_service_document_download($file_id){
957
- $url = $GLOBALS['base_url'] . '/?q=os2web/service/gf/v1/' . $file_id;//address of remote file
958
- $tmpfname = tempnam(file_directory_temp(), "os2web_cp_document_");//path to where the file will be downloaded
959
-
960
- $fp = fopen($tmpfname, 'w');
961
-
962
- $ch = curl_init($url);
963
- curl_setopt($ch, CURLOPT_FILE, $fp);
964
- $data= curl_exec($ch);
965
-
966
- //Check for 404 (file not found)
967
- $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
968
- curl_close($ch);
969
- fclose($fp);
970
-
971
- if ($httpCode == 404) {
972
- print_r('404');
973
- db_update('os2web_cp_service_documents_conversion')
974
- ->fields(array(
975
- 'status' => 'ERROR: 404 not found',
976
- ))
977
- ->condition('file_id', $file_id)
978
- ->execute();
979
- } else if ($httpCode == 200) {
980
- db_update('os2web_cp_service_documents_conversion')
981
- ->fields(array(
982
- 'filepath_pdf' => $tmpfname,
983
- 'status' => 'downloaded',
984
- ))
985
- ->condition('file_id', $file_id)
986
- ->execute();
987
- } else {//unknown error
988
- db_update('os2web_cp_service_documents_conversion')
989
- ->fields(array(
990
- 'status' => 'ERROR: ' . $httpCode,
991
- ))
992
- ->condition('file_id', $file_id)
993
- ->execute();
994
- }
995
- }
996
-
997
- /**
998
- * Converts the downloaded pdf into HTML
999
- * also updates the database entry with the created filepath
1000
- *
1001
- * If the PDF file is not found, the status is changed to NULL, so that the next cron job will download the file again.
1002
- *
1003
- * @param $file_id string id of the document file
1004
- * @param $path_to_pdf string path to the pdf version of the file
1005
- *
1006
- * @return none
1007
- */
1008
- function _os2web_cp_service_document_convert($file_id, $path_to_pdf){
1009
- if (!file_exists($path_to_pdf)){//if does not exist, send for redownloading
1010
- db_update('os2web_cp_service_documents_conversion')
1011
- ->fields(array(
1012
- 'filepath_pdf' => null,
1013
- 'status' => null,
1014
- ))
1015
- ->condition('file_id', $file_id)
1016
- ->execute();
1017
- } else {
1018
- shell_exec('pdf2htmlEX ' . $path_to_pdf . ' --dest-dir ' . file_directory_temp());
1019
- db_update('os2web_cp_service_documents_conversion')
1020
- ->fields(array(
1021
- 'filepath_html' => $path_to_pdf . '.html',
1022
- 'status' => 'converted',
1023
- ))
1024
- ->condition('file_id', $file_id)
1025
- ->execute();
1026
- }
1027
- }
1028
-
1029
- /**
1030
- * Takes the content of the HTML, removes everything except the plain text, and then appends this text to the case node's search metadata field.
1031
- * In the end the temp files (pdf and html) are deleted.
1032
- *
1033
- * If the HTML file is not found, the status will be changed to "downloaded" so that the next cron job will convert the file again.
1034
- * If the case node is not found, the status will be changed to "ERROR: node not found".
1035
- *
1036
- * @param $file_id string id of the document file
1037
- * @param $case_nid the node id of the case node whose metadata should be updated
1038
- * @param $path_to_pdf string path to the pdf version of the file
1039
- * @param $path_to_html string path to the html version of the file
1040
- *
1041
- * @return none
1042
- */
1043
- function _os2web_cp_service_update_case_metadata($file_id, $case_nid, $path_to_pdf, $path_to_html){
1044
- if (!file_exists($path_to_html)){//if does not exist, send for reconverting. PDF existence will be checked on that step as well.
1045
- db_update('os2web_cp_service_documents_conversion')
1046
- ->fields(array(
1047
- 'filepath_html' => null,
1048
- 'status' => 'downloaded',
1049
- ))
1050
- ->condition('file_id', $file_id)
1051
- ->execute();
1052
- } else {
1053
- $case_node = node_load($case_nid);
1054
- if (!$case_node){
1055
- db_update('os2web_cp_service_documents_conversion')
1056
- ->fields(array(
1057
- 'status' => 'ERROR: node not found',
1058
- ))
1059
- ->condition('file_id', $file_id)
1060
- ->execute();
1061
- } else {
1062
- $text = file_get_contents($path_to_html);
1063
-
1064
- //html tags removing
1065
- $text = str_replace('<p> </p>', ' ', $text); //removing unneeded paragraphs
1066
- $text = preg_replace('#<style(.*?)>(.*?)</style>#is', ' ', $text);//removing style tags
1067
- $text = preg_replace('#<script(.*?)>(.*?)</script>#is', ' ', $text);//removing scripts tags
1068
- $text = str_replace("\r\n", " ", strip_tags($text));
1069
- $text = str_replace("\n\r", " ", $text);
1070
- $text = str_replace("\n", " ", $text);
1071
- $text = str_replace("\r", " ", $text);
1072
-
1073
- $search_metadata = $case_node->field_os2web_cp_service_searchmt['und'][0]['value'] . $text;
1074
- $case_node->field_os2web_cp_service_searchmt['und'][0]['value'] = $search_metadata;
1075
- node_save($case_node);
1076
-
1077
- db_update('os2web_cp_service_documents_conversion')
1078
- ->fields(array(
1079
- 'status' => 'done',
1080
- ))
1081
- ->condition('file_id', $file_id)
1082
- ->execute();
1083
-
1084
- //tmp files cleanup
1085
- unlink($path_to_html);
1086
- unlink($path_to_pdf);
1087
- }
1088
- }
1089
- }
0 commit comments