|
1 | 1 | #include "get_tranco.hpp" |
2 | 2 | #include "../utils/utils.hpp" |
3 | 3 |
|
| 4 | +#include <curl/curl.h> |
| 5 | +#include <fstream> |
4 | 6 | #include <regex> |
5 | 7 |
|
6 | 8 | namespace duckdb |
7 | 9 | { |
8 | 10 | namespace netquack |
9 | 11 | { |
| 12 | + // Function to get the download code for the Tranco list |
| 13 | + std::string GetTrancoDownloadCode(char *date) |
| 14 | + { |
| 15 | + CURL *curl; |
| 16 | + CURLcode res; |
| 17 | + std::string readBuffer; |
| 18 | + |
| 19 | + // Construct the URL for the daily list |
| 20 | + std::string url = "https://tranco-list.eu/daily_list?date=" + std::string(date) + "&subdomains=true"; |
| 21 | + |
| 22 | + LogMessage("INFO", "Get Tranco download code for date: " + std::string(date)); |
| 23 | + |
| 24 | + curl = curl_easy_init(); |
| 25 | + if (curl) |
| 26 | + { |
| 27 | + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); |
| 28 | + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); // Follow redirects |
| 29 | + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); |
| 30 | + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &readBuffer); |
| 31 | + res = curl_easy_perform(curl); |
| 32 | + curl_easy_cleanup(curl); |
| 33 | + |
| 34 | + if (res != CURLE_OK) |
| 35 | + { |
| 36 | + throw std::runtime_error("Failed to fetch Tranco download code."); |
| 37 | + } |
| 38 | + } |
| 39 | + |
| 40 | + // Extract the download code from the URL |
| 41 | + std::regex code_regex(R"(Information on the Tranco list with ID ([A-Z0-9]+))"); |
| 42 | + std::smatch code_match; |
| 43 | + if (std::regex_search(readBuffer, code_match, code_regex) && code_match.size() > 1) |
| 44 | + { |
| 45 | + LogMessage("INFO", "Tranco download code: " + code_match[1].str()); |
| 46 | + return code_match[1].str(); |
| 47 | + } |
| 48 | + |
| 49 | + throw std::runtime_error("Failed to extract Tranco download code."); |
| 50 | + } |
| 51 | + |
| 52 | + // Function to download the Tranco list and create a table |
| 53 | + void LoadTrancoList(DatabaseInstance &db, bool force) |
| 54 | + { |
| 55 | + // Get yesterday's date in YYYY-MM-DD format |
| 56 | + std::time_t now = std::time(nullptr); |
| 57 | + std::tm *yesterday = std::localtime(&now); |
| 58 | + yesterday->tm_mday -= 1; // Subtract one day |
| 59 | + std::mktime(yesterday); // Normalize the time |
| 60 | + char date[11]; |
| 61 | + std::strftime(date, sizeof(date), "%Y-%m-%d", yesterday); |
| 62 | + |
| 63 | + // Construct the file name |
| 64 | + std::string temp_file = "tranco_list_" + std::string(date) + ".csv"; |
| 65 | + |
| 66 | + // Download the file if it doesn't exist or if force is true |
| 67 | + std::ifstream file(temp_file); |
| 68 | + if (force) |
| 69 | + { |
| 70 | + // Remove the old file if it exists |
| 71 | + if (file.good()) |
| 72 | + { |
| 73 | + remove(temp_file.c_str()); |
| 74 | + } |
| 75 | + // Get the download code |
| 76 | + std::string download_code = GetTrancoDownloadCode(date); |
| 77 | + |
| 78 | + // Construct the download URL |
| 79 | + std::string download_url = "https://tranco-list.eu/download/" + download_code + "/full"; |
| 80 | + |
| 81 | + LogMessage("INFO", "Download Tranco list: " + download_url); |
| 82 | + |
| 83 | + // Download the CSV file to a temporary file |
| 84 | + CURL *curl; |
| 85 | + CURLcode res; |
| 86 | + FILE *file = fopen(temp_file.c_str(), "wb"); |
| 87 | + if (!file) |
| 88 | + { |
| 89 | + throw std::runtime_error("Failed to create temporary file for Tranco list."); |
| 90 | + } |
| 91 | + |
| 92 | + curl = curl_easy_init(); |
| 93 | + if (curl) |
| 94 | + { |
| 95 | + curl_easy_setopt(curl, CURLOPT_URL, download_url.c_str()); |
| 96 | + curl_easy_setopt(curl, CURLOPT_WRITEDATA, file); |
| 97 | + res = curl_easy_perform(curl); |
| 98 | + curl_easy_cleanup(curl); |
| 99 | + fclose(file); |
| 100 | + |
| 101 | + if (res != CURLE_OK) |
| 102 | + { |
| 103 | + remove(temp_file.c_str()); // Clean up the temporary file |
| 104 | + throw std::runtime_error("Failed to download Tranco list."); |
| 105 | + } |
| 106 | + } |
| 107 | + } |
| 108 | + |
| 109 | + if (!file.good()) |
| 110 | + { |
| 111 | + LogMessage("ERROR", "Tranco list not found. Download it first using `SELECT update_tranco(true);`"); |
| 112 | + } |
| 113 | + |
| 114 | + // Parse the CSV data and insert into a table |
| 115 | + LogMessage("INFO", "Inserting Tranco list into table"); |
| 116 | + |
| 117 | + Connection con(db); |
| 118 | + con.Query("CREATE OR REPLACE TABLE tranco_list AS SELECT * FROM read_csv('" + temp_file + "', header=false, columns={'rank': 'INTEGER', 'domain': 'VARCHAR'})"); |
| 119 | + } |
| 120 | + |
10 | 121 | // Function to update the Tranco list table |
11 | 122 | void UpdateTrancoListFunction(DataChunk &args, ExpressionState &state, Vector &result) |
12 | 123 | { |
|
0 commit comments