|
89 | 89 |
|
90 | 90 |
|
91 | 91 | def _download_and_clean(dataset, data_dir):
|
92 |
| - """Download MovieLens dataset in a standard format. |
| 92 | + """Download the MovieLens dataset in a standard format. |
93 | 93 |
|
94 | 94 | This function downloads the specified MovieLens format and coerces it into a
|
95 | 95 | standard format. The only difference between the ml-1m and ml-20m datasets
|
@@ -148,10 +148,10 @@ def _transform_csv(input_path, output_path, names, skip_first, separator=","):
|
148 | 148 |
|
149 | 149 | Args:
|
150 | 150 | input_path: The path of the raw csv.
|
151 |
| - output_path: The path of the cleaned csv. |
152 |
| - names: The csv column names. |
153 |
| - skip_first: Boolean of whether to skip the first line of the raw csv. |
154 |
| - separator: Character used to separate fields in the raw csv. |
| 151 | + output_path: The location of the cleaned csv file. |
| 152 | + names: The names of the csv columns. |
| 153 | + skip_first: Boolean indicating whether the first line of the raw csv should be skipped. |
| 154 | + separator: The character used to separate fields in the raw csv. |
155 | 155 | """
|
156 | 156 | if six.PY2:
|
157 | 157 | names = [six.ensure_text(n, "utf-8") for n in names]
|
@@ -179,17 +179,17 @@ def _regularize_1m_dataset(temp_dir):
|
179 | 179 | ratings.dat
|
180 | 180 | The file has no header row, and each line is in the following format:
|
181 | 181 | UserID::MovieID::Rating::Timestamp
|
182 |
| - - UserIDs range from 1 and 6040 |
183 |
| - - MovieIDs range from 1 and 3952 |
| 182 | + - UserIDs range between 1 and 6040 |
| 183 | + - MovieIDs can range between 1 and 3952 |
184 | 184 | - Ratings are made on a 5-star scale (whole-star ratings only)
|
185 |
| - - Timestamp is represented in seconds since midnight Coordinated Universal |
| 185 | + - Timestamp is represented in seconds since midnight Coordinated Universal |
186 | 186 | Time (UTC) of January 1, 1970.
|
187 | 187 | - Each user has at least 20 ratings
|
188 | 188 |
|
189 | 189 | movies.dat
|
190 | 190 | Each line has the following format:
|
191 | 191 | MovieID::Title::Genres
|
192 |
| - - MovieIDs range from 1 and 3952 |
| 192 | + - MovieIDs can range between 1 and 3952 |
193 | 193 | """
|
194 | 194 | working_dir = os.path.join(temp_dir, ML_1M)
|
195 | 195 |
|
@@ -223,7 +223,7 @@ def _regularize_20m_dataset(temp_dir):
|
223 | 223 | movies.csv
|
224 | 224 | Each line has the following format:
|
225 | 225 | MovieID,Title,Genres
|
226 |
| - - MovieIDs range from 1 and 3952 |
| 226 | + - MovieIDs can range between 1 and 3952 |
227 | 227 | """
|
228 | 228 | working_dir = os.path.join(temp_dir, ML_20M)
|
229 | 229 |
|
@@ -265,7 +265,7 @@ def csv_to_joint_dataframe(data_dir, dataset):
|
265 | 265 |
|
266 | 266 |
|
267 | 267 | def integerize_genres(dataframe):
|
268 |
| - """Replace genre string with a binary vector. |
| 268 | + """Replace the genre string with a binary vector. |
269 | 269 |
|
270 | 270 | Args:
|
271 | 271 | dataframe: a pandas dataframe of movie data.
|
@@ -308,7 +308,7 @@ def define_data_download_flags():
|
308 | 308 |
|
309 | 309 |
|
310 | 310 | def main(_):
|
311 |
| - """Download and extract the data from GroupLens website.""" |
| 311 | + """Download and extract the data from the GroupLens website.""" |
312 | 312 | download(flags.FLAGS.dataset, flags.FLAGS.data_dir)
|
313 | 313 |
|
314 | 314 |
|
|
0 commit comments