@@ -269,18 +269,6 @@ private void grabPage(String url, String outputDirPath) throws Exception {
269269
270270 }
271271
272- for (String f : GrabUtility .framesToGrab ) {
273- lt .d (f );
274- }
275- for (String f : GrabUtility .cssToGrab ) {
276- lt .d (f );
277- }
278- for (String f : GrabUtility .extraCssToGrab ) {
279- lt .d (f );
280- }
281- for (String f : GrabUtility .filesToGrab ) {
282- lt .d (f );
283- }
284272 //download extra files, such as images / scripts
285273 for (String urlToDownload : GrabUtility .filesToGrab ) {
286274
@@ -729,8 +717,8 @@ private void getExtraFile(String urlToDownload, File outputDir) {
729717}
730718
731719/**
732- * @author Pramod Khare & improved by Jonas Czec
733- * Contains all the utility methods used in above GrabWebPage class
720+ * @author Pramod Khare & improved by Jonas Czech
721+ * Contains all the utility methods used in the above class
734722 */
735723class GrabUtility {
736724 // filesToGrab - maintains all the links to files which we are going to grab/download
@@ -762,7 +750,7 @@ public static String parseHtmlForLinks(String htmlToParse, String baseUrl) {
762750 try {
763751 fromHTMLPageUrl = new URL (baseUrl );
764752 noBaseUrl = false ;
765- } catch (java . net . MalformedURLException e ) {
753+ } catch (MalformedURLException e ) {
766754 fromHTMLPageUrl = null ;
767755 noBaseUrl = true ;
768756 }
@@ -872,14 +860,24 @@ public static String parseHtmlForLinks(String htmlToParse, String baseUrl) {
872860 }
873861
874862 if (saveVideo ) {
875- links = parsedHtml .select ("video" );
863+ //a video's src is sometimes on a child element instead of on the <video> tag itself, so handle videos without their own src first
864+ links = parsedHtml .select ("video:not([src])" );
876865 for (Element link : links .select ("[src]" )){
877866 urlToGrab = link .attr ("abs:src" );
878867 addLinkToList (urlToGrab );
879868
880869 String replacedURL = urlToGrab .substring (urlToGrab .lastIndexOf ("/" ) + 1 ).replaceAll ("[^a-zA-Z0-9-_\\ .]" , "_" );
881870 link .attr ("src" , replacedURL );
882871 }
872+
873+ links = parsedHtml .select ("video[src]" );
874+ for (Element link : links ){
875+ urlToGrab = link .attr ("abs:src" );
876+ addLinkToList (urlToGrab );
877+
878+ String replacedURL = urlToGrab .substring (urlToGrab .lastIndexOf ("/" ) + 1 ).replaceAll ("[^a-zA-Z0-9-_\\ .]" , "_" );
879+ link .attr ("src" , replacedURL );
880+ }
883881 }
884882
885883 if (makeLinksAbsolute ) {
@@ -896,8 +894,6 @@ public static String parseHtmlForLinks(String htmlToParse, String baseUrl) {
896894
897895 public static String parseCssForLinks (String cssToParse , String baseUrl ) {
898896
899-
900- String importString = "@(import\\ s*['\" ])()([^ '\" ]*)" ;
901897 String patternString = "url(\\ s*\\ (\\ s*['\" ]*\\ s*)(.*?)\\ s*['\" ]*\\ s*\\ )" ; //I hate regexes...
902898
903899 Pattern pattern = Pattern .compile (patternString );
@@ -915,9 +911,11 @@ public static String parseCssForLinks(String cssToParse, String baseUrl) {
915911
916912 }
917913
918- addLinkToList (makeLinkRelative (matcher .group ().replaceAll (patternString , "$2" ).trim (), baseUrl ));
914+ addLinkToList (makeLinkAbsolute (matcher .group ().replaceAll (patternString , "$2" ).trim (), baseUrl ));
919915 }
920916
917+ // find css linked with @import - needs testing
918+ String importString = "@(import\\ s*['\" ])()([^ '\" ]*)" ;
921919 pattern = Pattern .compile (importString );
922920 matcher = pattern .matcher (cssToParse );
923921 matcher .reset ();
@@ -930,30 +928,38 @@ public static String parseCssForLinks(String cssToParse, String baseUrl) {
930928
931929 }
932930
933- extraCssToGrab .add (makeLinkRelative (matcher .group ().replaceAll (patternString , "$2" ).trim (), baseUrl ));
931+ extraCssToGrab .add (makeLinkAbsolute (matcher .group ().replaceAll (patternString , "$2" ).trim (), baseUrl ));
934932 }
935933
936934 return cssToParse ;
937935 }
938936
939937 public static void addLinkToList (String link ) { // records link in filesToGrab unless it is already present
938+ //no multithreading for now; the lock just keeps the contains/add pair atomic so each link is stored only once
940939 synchronized (filesToGrab ) {
941940 if (!filesToGrab .contains (link )) {
942941 filesToGrab .add (link );
943942 }
944943 }
945944 }
946945
947- public static String makeLinkRelative (String link , String baseurl ) {
948- //jsoup figures out the absolute url for me...
949- Document d = Document .createShell (baseurl );
950- d .body ().appendElement ("img" ).attr ("src" , link );
951- return d .body ().child (0 ).attr ("abs:src" );
946+ public static String makeLinkAbsolute (String link , String baseurl ) { // resolves link against baseurl and returns the resulting URL as a string
952947
948+ try {
949+ URL u = new URL (new URL (baseurl ), link ); // baseurl is parsed first, then link is interpreted in its context
950+ return u .toString ();
951+ } catch (MalformedURLException e ) { // thrown when either baseurl or the combined URL cannot be parsed
952+ lt .e ("MalformedURLException while making url absolute" );
953+ lt .e ("Link: " + link );
954+ lt .e ("BaseURL: " + baseurl );
955+ return null ; // NOTE(review): parseCssForLinks passes this result straight to addLinkToList / extraCssToGrab.add - confirm that a null entry is handled downstream
956+ }
957+
953958 }
954959}
955960
956961class lt {
962+ //log messages are sent here
957963 public static void e (String message ) {
958964 Log .e ("SaveService" , message );
959965 }
0 commit comments