diff --git a/data-upload/README.md b/data-upload/README.md new file mode 100644 index 00000000..1cc02f4d --- /dev/null +++ b/data-upload/README.md @@ -0,0 +1,10 @@ +# Data uploading tool +Use this command line tool to upload JSON files to a MongoDB server. + +You can run this tool either in IntelliJ or from the command line. + +If you are using the command line, first build this tool by running `./gradlew build`. +Find the distribution in the `build/distributions` directory, untar the compressed file and +you should find the executables in the `bin` directory. + +This tool takes two arguments. Specify the JSON files directory with `-d` and the MongoDB endpoint with `-e`. \ No newline at end of file diff --git a/data-upload/build.gradle b/data-upload/build.gradle index eb596185..116f6317 100644 --- a/data-upload/build.gradle +++ b/data-upload/build.gradle @@ -1,9 +1,14 @@ plugins { id 'java' + id 'application' } -group 'org.example' -version '1.0-SNAPSHOT' +group 'org.techVault.webScrapping' +version '0.0.1-SNAPSHOT' + +application { + mainClass = 'uploader' +} repositories { mavenCentral() @@ -22,4 +27,4 @@ dependencies { test { useJUnitPlatform() -} \ No newline at end of file +} diff --git a/data-upload/gradlew b/data-upload/gradlew old mode 100644 new mode 100755 index 4f906e0c..fbd7c515 --- a/data-upload/gradlew +++ b/data-upload/gradlew @@ -130,7 +130,7 @@ fi if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then APP_HOME=`cygpath --path --mixed "$APP_HOME"` CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` - + JAVACMD=`cygpath --unix "$JAVACMD"` # We build the pattern for arguments to be converted via cygpath diff --git a/data-upload/gradlew.bat b/data-upload/gradlew.bat index 107acd32..a9f778a7 100644 --- a/data-upload/gradlew.bat +++ b/data-upload/gradlew.bat @@ -1,89 +1,104 @@ -@rem -@rem Copyright 2015 the original author or authors. 
-@rem -@rem Licensed under the Apache License, Version 2.0 (the "License"); -@rem you may not use this file except in compliance with the License. -@rem You may obtain a copy of the License at -@rem -@rem https://www.apache.org/licenses/LICENSE-2.0 -@rem -@rem Unless required by applicable law or agreed to in writing, software -@rem distributed under the License is distributed on an "AS IS" BASIS, -@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -@rem See the License for the specific language governing permissions and -@rem limitations under the License. -@rem - -@if "%DEBUG%" == "" @echo off -@rem ########################################################################## -@rem -@rem Gradle startup script for Windows -@rem -@rem ########################################################################## - -@rem Set local scope for the variables with windows NT shell -if "%OS%"=="Windows_NT" setlocal - -set DIRNAME=%~dp0 -if "%DIRNAME%" == "" set DIRNAME=. -set APP_BASE_NAME=%~n0 -set APP_HOME=%DIRNAME% - -@rem Resolve any "." and ".." in APP_HOME to make it shorter. -for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi - -@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" - -@rem Find java.exe -if defined JAVA_HOME goto findJavaFromJavaHome - -set JAVA_EXE=java.exe -%JAVA_EXE% -version >NUL 2>&1 -if "%ERRORLEVEL%" == "0" goto execute - -echo. -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:findJavaFromJavaHome -set JAVA_HOME=%JAVA_HOME:"=% -set JAVA_EXE=%JAVA_HOME%/bin/java.exe - -if exist "%JAVA_EXE%" goto execute - -echo. -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% -echo. 
-echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:execute -@rem Setup the command line - -set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar - - -@rem Execute Gradle -"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* - -:end -@rem End local scope for the variables with windows NT shell -if "%ERRORLEVEL%"=="0" goto mainEnd - -:fail -rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of -rem the _cmd.exe /c_ return code! -if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 -exit /b 1 - -:mainEnd -if "%OS%"=="Windows_NT" endlocal - -:omega +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. +@rem + +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. 
+for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windows variants + +if not "%OS%" == "Windows_NT" goto win9xME_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! 
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/data-upload/src/main/java/uploader.java b/data-upload/src/main/java/uploader.java index 98e48e7f..8ba0e9b6 100644 --- a/data-upload/src/main/java/uploader.java +++ b/data-upload/src/main/java/uploader.java @@ -12,16 +12,16 @@ import java.io.File; import java.io.FileReader; -import java.text.ParseException; import java.util.ArrayList; import java.util.List; +import java.util.UUID; import java.util.stream.Collectors; public class uploader { public static final String DBNAME = "techVault"; public static final ImmutableList COMPANIES = - ImmutableList.of("Linkedin", "Yelp", "Yahoo", "Twilio", "Stack", "AWS"); + ImmutableList.of("airbnb", "aws", "babble", "confluent", "criteo", "deepmind", "ebay", "facebook", "linkedin", "medium", "netflix", "nvidia", "quora", "slack", "stackoverflow", "twilio", "uber", "yahoo", "yelp"); private static List readFileToJsonString(File file) { JSONParser parser = new JSONParser(); @@ -29,10 +29,18 @@ private static List readFileToJsonString(File file) { try { Object obj = parser.parse(new FileReader(file)); JSONObject jsonObject = (JSONObject) obj; - JSONArray blogs = (JSONArray) jsonObject.get("Linkedin"); - for(Object blog: blogs.toArray()){ - JSONObject jsonObj = (JSONObject)blog; - list.add(jsonObj.toJSONString()); + for(String s : COMPANIES) { + if(jsonObject.containsKey(s)) { + JSONArray blogs = (JSONArray) jsonObject.get(s); + for (Object blog : blogs.toArray()) { + JSONObject jsonObj = (JSONObject) blog; + jsonObj.put("company", s); + final String uuid = UUID.randomUUID().toString().replace("-", ""); + jsonObj.put("uuid", uuid); + list.add(jsonObj.toJSONString()); + } + break; + } } } catch (Exception e) { e.printStackTrace(); @@ -43,21 +51,17 @@ private static List readFileToJsonString(File file) { public static void main(String[] args) { Options options = new Options(); - Option f = new Option("f", "file", 
true, "input file path"); - f.setRequired(true); - options.addOption(f); + Option directoryOption = new Option("d", "directory", true, "json file directory"); + directoryOption.setRequired(true); + options.addOption(directoryOption); - Option u = new Option("u", "user", true, "user name for mongodb"); - u.setRequired(true); - options.addOption(u); - - Option p = new Option("p", "password", true, "password"); - p.setRequired(true); - options.addOption(p); + Option endpointOption = new Option("e", "endpoint", true, "mongoDB endpoint, e.g., mongodb+srv://:@cluster0.0eph1.mongodb.net/?retryWrites=true&w=majority"); + endpointOption.setRequired(true); + options.addOption(endpointOption); CommandLineParser parser = new DefaultParser(); HelpFormatter formatter = new HelpFormatter(); - CommandLine cmd; + CommandLine cmd = null; try { cmd = parser.parse(options, args); @@ -68,20 +72,18 @@ public static void main(String[] args) { } Preconditions.checkNotNull(cmd); - String folderName = cmd.getOptionValue("folder"); - String username = cmd.getOptionValue("user"); - String password = cmd.getOptionValue("password"); + String directory = cmd.getOptionValue("directory"); + String endpoint = cmd.getOptionValue("endpoint"); - File folder = new File(folderName); + File folder = new File(directory); File[] listOfFiles = folder.listFiles(); Preconditions.checkNotNull(listOfFiles); - String endpoint = String.format("mongodb+srv://%s:%s@cluster0.0eph1.mongodb.net/%s?retryWrites=true&w=majority", username, password, DBNAME); MongoClient mongoClient = MongoClients.create(endpoint); MongoDatabase database = mongoClient.getDatabase(DBNAME); - MongoCollection collection = database.getCollection("collection"); + MongoCollection collection = database.getCollection("blogs"); - for(File file : listOfFiles) { + for (File file : listOfFiles) { List docs = readFileToJsonString(file).stream().map(Document::parse).collect(Collectors.toList()); collection.insertMany(docs); }