|
1 | 1 | import urllib.request |
2 | 2 | import os |
3 | | -import hashlib |
4 | | -import tarfile |
| 3 | +import csv |
| 4 | +import collections |
5 | 5 |
|
6 | | -# Constants |
7 | | -TARGET_TAR_FILE = "downloaded_files.tar.gz" |
8 | | -EXPECTED_MD5_CHECKSUM = "d41d8cd98f00b204e9800998ecf8427e" |
9 | 6 |
|
10 | | - |
def task_func(url: str, column_name: str, csv_file_path: str):
    """
    Download a CSV file, count the values of one column, and delete the file.

    The file at ``url`` is saved to ``csv_file_path``, parsed with
    ``csv.DictReader`` (the first row is the header), and the occurrences of
    each value found in ``column_name`` are counted. The downloaded file is
    removed before the function returns or raises, once the download itself
    has succeeded.

    Parameters:
    url (str): The URL of the CSV file to be downloaded.
    column_name (str): The name of the column whose values are to be counted.
    csv_file_path (str): The file path where the downloaded CSV file is saved.
                         An existing file at this path is overwritten.

    Returns:
    collections.Counter: A dict subclass mapping each value from the specified
                         column to its occurrence count. It compares equal to
                         a plain dict with the same items.

    Raises:
    ValueError: If column_name is not in the CSV header (including the case of
                an empty file, which has no header). The downloaded file is
                deleted and the message is
                "The provided column_name '{column_name}' does not exist in the CSV file."
    Any exception raised by ``urllib.request.urlretrieve`` propagates unchanged;
    in that case no cleanup is attempted.

    Requirements:
    - urllib
    - os
    - csv
    - collections

    Example (hypothetical output; the actual result depends on the CSV data):
        task_func('http://example.com/data.csv', 'category', 'downloaded_data.csv')
        might return Counter({'cat3': 8, 'cat1': 5, 'cat2': 3})

    Notes:
    - The downloaded CSV file is deleted after its contents have been processed.
    - The function only counts values in the specified column and ignores other data.
    """
    urllib.request.urlretrieve(url, csv_file_path)
    try:
        # newline="" lets the csv module handle line endings itself, so that
        # newlines embedded in quoted fields are parsed correctly.
        with open(csv_file_path, "r", encoding="utf-8", newline="") as f:
            reader = csv.DictReader(f)
            # fieldnames is None when the file is empty; treat that as
            # "no columns" so the documented ValueError is raised.
            fieldnames = reader.fieldnames or []
            if column_name not in fieldnames:
                raise ValueError(
                    f"The provided column_name '{column_name}' does not exist in the CSV file."
                )
            counts = collections.Counter(row[column_name] for row in reader)
    finally:
        # Runs after the `with` block has closed the file, so removal also
        # works on platforms that refuse to delete open files, and the file
        # is cleaned up on every exit path.
        os.remove(csv_file_path)
    return counts
49 | 55 |
|
50 | 56 | import unittest |
51 | | -from unittest.mock import patch |
52 | | -import urllib.request |
53 | | -import hashlib |
| 57 | +from unittest.mock import patch, mock_open |
54 | 58 | import os |
55 | | -# Constants from the task_func function |
56 | | -TARGET_TAR_FILE = "downloaded_files.tar.gz" |
57 | | -EXPECTED_MD5_CHECKSUM = "d41d8cd98f00b204e9800998ecf8427e" |
class TestCases(unittest.TestCase):
    """Test cases for the task_func function.

    Network access, file reads and file removal are all mocked, so no real
    download happens and nothing is written to or deleted from disk.
    """

    URL = "mock_url"

    def _call_with_csv(self, csv_text, column_name, csv_file_path):
        """Run task_func against in-memory CSV text and verify download and cleanup.

        Returns whatever task_func returns and lets its exceptions propagate.
        Whether the call succeeds or raises, this asserts that the download
        was requested once with the given URL and path and that the
        downloaded file was removed exactly once.
        """
        # builtins.open is patched last so that setting up the other patches
        # never runs with a mocked open().
        with patch("urllib.request.urlretrieve") as mock_urlretrieve, \
                patch("os.remove") as mock_remove, \
                patch("builtins.open", mock_open(read_data=csv_text)):
            try:
                return task_func(self.URL, column_name, csv_file_path)
            finally:
                mock_urlretrieve.assert_called_once_with(self.URL, csv_file_path)
                mock_remove.assert_called_once_with(csv_file_path)

    def test_count_categories_data1(self):
        """Test that the function counts the occurrences of each category in the CSV file."""
        csv_text = "category,other\n" + "cat1,x\n" * 2 + "cat2,y\n" * 2 + "cat3,z\n"
        result = self._call_with_csv(csv_text, "category", "/mock/path/data1.csv")
        self.assertEqual(result, {"cat1": 2, "cat2": 2, "cat3": 1})

    def test_count_names_data2(self):
        """Test that the function counts the occurrences of each name in the CSV file."""
        csv_text = "name,other\n" + "Alice,x\n" * 2 + "Bob,y\n" + "Charlie,z\n"
        result = self._call_with_csv(csv_text, "name", "/mock/path/data2.csv")
        self.assertEqual(result, {"Alice": 2, "Bob": 1, "Charlie": 1})

    def test_count_categories_data3(self):
        """Test that the function counts the occurrences of each category in the CSV file."""
        csv_text = "category,other\n" + "cat1,x\n" * 2 + "cat2,y\n" + "cat3,z\n" * 2
        result = self._call_with_csv(csv_text, "category", "/mock/path/data3.csv")
        self.assertEqual(result, {"cat1": 2, "cat2": 1, "cat3": 2})

    def test_count_names_data3(self):
        """Test that the function counts the occurrences of each name in the CSV file."""
        csv_text = "name,other\n" + "Alice,x\n" * 3 + "Bob,y\n" + "Charlie,z\n"
        result = self._call_with_csv(csv_text, "name", "/mock/path/data3.csv")
        self.assertEqual(result, {"Alice": 3, "Bob": 1, "Charlie": 1})

    def test_non_existent_column(self):
        """Test that the function raises an exception when the specified column does not exist."""
        csv_text = "name,other\n" + "Alice,x\n" * 3 + "Bob,y\n" + "Charlie,z\n"
        with self.assertRaises(ValueError) as ctx:
            self._call_with_csv(csv_text, "non_existent_column", "/mock/path/data3.csv")
        self.assertEqual(
            str(ctx.exception),
            "The provided column_name 'non_existent_column' does not exist in the CSV file.",
        )

    def test_download_failure_propagates(self):
        """Test that a failed download propagates and no file is opened or removed."""
        with patch(
            "urllib.request.urlretrieve", side_effect=OSError("Download failed")
        ), patch("os.remove") as mock_remove, patch(
            "builtins.open", mock_open(read_data="")
        ) as mock_file:
            with self.assertRaises(OSError):
                task_func(self.URL, "category", "/mock/path/data1.csv")
            mock_file.assert_not_called()
            mock_remove.assert_not_called()
0 commit comments