88@desc: 习题27.1 基于双向LSTM的ELMo预训练语言模型,假设下游任务是文本分类
99"""
1010import csv
11+
1112import os
1213import time
1314
@@ -28,52 +29,51 @@ def to_map_style_dataset(iter_data):
2829 return list (iter_data )
2930
3031
def download_file(filename, filepath):
    """Download one AG_NEWS CSV file, trying several mirrors in order.

    Each mirror base URL is tried in turn. The first one that answers with
    HTTP 200 has its body streamed to disk. The body is written to
    ``filepath + '.part'`` first and only renamed onto ``filepath`` after the
    whole stream has been read, so an interrupted transfer never leaves a
    truncated file at ``filepath`` (callers that only check whether the file
    exists would otherwise accept a partial download).

    Args:
        filename: File name appended to each mirror base URL,
            e.g. ``"train.csv"``.
        filepath: Local path the downloaded file is written to.

    Raises:
        RuntimeError: If none of the mirrors delivered the file.
    """
    base_urls = [
        "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/",
        "https://ghproxy.net/https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/",
        "https://fastly.jsdelivr.net/gh/mhjabreel/CharCnn_Keras@master/data/ag_news_csv/",
    ]
    # Staging file: only promoted to `filepath` once fully written.
    tmp_path = filepath + '.part'

    print(f"Attempting to download {filename} ...")

    for base_url in base_urls:
        url = base_url + filename
        print(f"Trying {url} ...")
        response = None
        try:
            try:
                response = requests.get(url, stream=True, timeout=10)
            except requests.exceptions.SSLError:
                # NOTE(review): retrying with verify=False disables TLS
                # certificate checks, so the payload is not authenticated.
                # Kept as a deliberate best-effort fallback; confirm this is
                # acceptable for the environments this script runs in.
                print(f"SSL Error with {url} , trying without verification...")
                response = requests.get(url, stream=True, timeout=10, verify=False)

            if response.status_code == 200:
                with open(tmp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                # Atomic promotion: `filepath` appears only when complete.
                os.replace(tmp_path, filepath)
                print(f"Downloaded successfully from {url} ")
                return
            else:
                print(f"Failed to download from {url} , status code: {response.status_code} ")
        except Exception as e:
            # Best effort per mirror: report the failure and try the next one.
            print(f"Error downloading from {url} : {e} ")
        finally:
            # Streamed responses hold the connection until closed.
            if response is not None:
                response.close()
            # Drop any partial staging file left by a failed attempt.
            if os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

    # If all mirrors fail
    raise RuntimeError(
        f"Failed to download {filename} from all mirrors.\n "
        f"Please manually download 'train.csv' and 'test.csv' from "
        f"https://github.com/mhjabreel/CharCnn_Keras/tree/master/data/ag_news_csv "
        f"and place them in {os.path.dirname(filepath)} "
    )
69-
7032def AG_NEWS (root = './data' ):
7133 base_path = os .path .join (root , 'datasets' , 'AG_NEWS' )
7234 os .makedirs (base_path , exist_ok = True )
7335
7436 train_path = os .path .join (base_path , 'train.csv' )
7537 test_path = os .path .join (base_path , 'test.csv' )
7638
def download_file(filename, filepath):
    """Download one AG_NEWS CSV file, trying several mirrors in order.

    Each mirror base URL is tried in turn. The first one that answers with
    HTTP 200 has its body streamed to disk. The body is written to
    ``filepath + '.part'`` first and only renamed onto ``filepath`` after the
    whole stream has been read, so an interrupted transfer never leaves a
    truncated file at ``filepath`` (callers that only check whether the file
    exists would otherwise accept a partial download).

    Args:
        filename: File name appended to each mirror base URL,
            e.g. ``"train.csv"``.
        filepath: Local path the downloaded file is written to.

    Raises:
        RuntimeError: If none of the mirrors delivered the file.
    """
    base_urls = [
        "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/",
        "https://ghproxy.net/https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/",
        "https://fastly.jsdelivr.net/gh/mhjabreel/CharCnn_Keras@master/data/ag_news_csv/",
    ]
    # Staging file: only promoted to `filepath` once fully written.
    tmp_path = filepath + '.part'

    print(f"Attempting to download {filename} ...")

    for base_url in base_urls:
        url = base_url + filename
        print(f"Trying {url} ...")
        response = None
        try:
            try:
                response = requests.get(url, stream=True, timeout=10)
            except requests.exceptions.SSLError:
                # NOTE(review): retrying with verify=False disables TLS
                # certificate checks, so the payload is not authenticated.
                # Kept as a deliberate best-effort fallback; confirm this is
                # acceptable for the environments this script runs in.
                print(f"SSL Error with {url} , trying without verification...")
                response = requests.get(url, stream=True, timeout=10, verify=False)

            if response.status_code == 200:
                with open(tmp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                # Atomic promotion: `filepath` appears only when complete.
                os.replace(tmp_path, filepath)
                print(f"Downloaded successfully from {url} ")
                return
            else:
                print(f"Failed to download from {url} , status code: {response.status_code} ")
        except Exception as e:
            # Best effort per mirror: report the failure and try the next one.
            print(f"Error downloading from {url} : {e} ")
        finally:
            # Streamed responses hold the connection until closed.
            if response is not None:
                response.close()
            # Drop any partial staging file left by a failed attempt.
            if os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

    # If all mirrors fail
    raise RuntimeError(
        f"Failed to download {filename} from all mirrors.\n "
        f"Please manually download 'train.csv' and 'test.csv' from "
        f"https://github.com/mhjabreel/CharCnn_Keras/tree/master/data/ag_news_csv "
        f"and place them in {os.path.dirname(filepath)} "
    )
76+
7777 if not os .path .exists (train_path ):
7878 download_file ("train.csv" , train_path )
7979
@@ -101,10 +101,10 @@ def get_elmo_model():
101101 elmo_options_file = './data/elmo_2x1024_128_2048cnn_1xhighway_options.json'
102102 elmo_weight_file = './data/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
103103 url = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
104- if ( not os .path .exists (elmo_options_file ) ):
104+ if not os .path .exists (elmo_options_file ):
105105 wget .download (url , elmo_options_file )
106106 url = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
107- if ( not os .path .exists (elmo_weight_file ) ):
107+ if not os .path .exists (elmo_weight_file ):
108108 wget .download (url , elmo_weight_file )
109109
110110 print ("Initializing ELMo model..." )
0 commit comments