From b3e0c3911c6e02b7407b8bad7fd6907814bb106b Mon Sep 17 00:00:00 2001 From: "Nicholaus E. Halecky" Date: Fri, 15 Feb 2013 11:16:07 -0800 Subject: [PATCH 1/3] BUG/ENH: Fixed decode issue in get_data_yahoo() --- pandas/io/data.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/pandas/io/data.py b/pandas/io/data.py index 5e92fcaa62427..06ab8439a6b01 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -14,6 +14,7 @@ from zipfile import ZipFile from pandas.util.py3compat import StringIO, BytesIO, bytes_to_str +import pandas as pd from pandas import Panel, DataFrame, Series, read_csv, concat from pandas.io.parsers import TextParser @@ -115,7 +116,7 @@ def get_quote_yahoo(symbols): return None for line in lines: - fields = line.strip().split(',') + fields = line.decode().strip().split(',') for i, field in enumerate(fields): if field[-2:] == '%"': data[header[i]].append(float(field.strip('"%'))) @@ -133,7 +134,7 @@ def get_quote_yahoo(symbols): def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3, - pause=0): + pause=0, **kwargs): """ Get historical data for the given name from yahoo. Date format is datetime @@ -195,11 +196,22 @@ def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']): def _calc_return_index(price_df): """ - Return a returns index from a input price df or series. + Return a returns index from an input price df or series. Initial value + (typically NaN) is set to 1. """ - - ret_index = price_df.pct_change().add(1).cumprod() - ret_index.ix[0] = 1 + df = price_df.pct_change().add(1).cumprod() + mask = ~df.ix[1].isnull() & df.ix[0].isnull() + df.ix[0][mask] = 1 + + #Check for first stock listings after starting date of index in ret_index + #If True, find first_valid_index and set previous entry to 1. 
+ if(~mask).any: + for sym in mask.index[~mask]: + tstamp = df[sym].first_valid_index() + t_idx = df.index.get_loc(tstamp) - 1 + df[sym].ix[t_idx] = 1 + + ret_index = df return ret_index @@ -241,7 +253,7 @@ def get_components_yahoo(idx_sym): #break when no new components are found while (True in mask): urlStr = url.format(idx_mod, stats, comp_idx) - lines = (urllib.urlopen(urlStr).read().strip(). + lines = (urllib.urlopen(urlStr).read().decode().strip(). strip('"').split('"\r\n"')) lines = [line.strip().split('","') for line in lines] @@ -258,7 +270,8 @@ def get_components_yahoo(idx_sym): def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0, - adjust_price=False, ret_index=False, chunksize=25, **kwargs): + adjust_price=False, ret_index=False, chunksize=25, + log_info=False, **kwargs): """ Returns DataFrame/Panel of historical stock prices from symbols, over date range, start to end. To avoid being penalized by Yahoo! Finance servers, @@ -266,8 +279,8 @@ def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0, Parameters ---------- - symbols : string, list-like object (list, tupel, Series), or DataFrame - Single stock symbol (ticker), list-like object of symbols or + symbols : string, array-like object (list, tuple, Series), or DataFrame + Single stock symbol (ticker), array-like object of symbols or DataFrame with index containing stock symbols. start : string, (defaults to '1/1/2010') Starting date, timestamp. Parses many different kind of date @@ -290,7 +303,7 @@ def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0, Returns ------- - hist_data : DataFrame (str) or Panel (list-like object, DataFrame) + hist_data : DataFrame (str) or Panel (array-like object, DataFrame) """ def dl_mult_symbols(symbols): From ecea4015efe93d13a71706833da5bb342c081899 Mon Sep 17 00:00:00 2001 From: "Nicholaus E. 
Halecky" Date: Fri, 15 Feb 2013 11:18:17 -0800 Subject: [PATCH 2/3] TST: Better tests for yahoo finance features. --- pandas/io/tests/test_yahoo.py | 48 +++++++++++------------------------ 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py index 2d56260abebbd..2e339b4839ef3 100644 --- a/pandas/io/tests/test_yahoo.py +++ b/pandas/io/tests/test_yahoo.py @@ -43,14 +43,12 @@ def test_yahoo(self): raise - @slow @network def test_get_quote(self): df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG'])) assert_series_equal(df.ix[0], df.ix[2]) - @slow @network def test_get_components(self): @@ -71,10 +69,9 @@ def test_get_components(self): assert 'GOOG' in df.index assert 'AMZN' in df.index - @slow + @network def test_get_data(self): - import numpy as np #single symbol #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d df = web.get_data_yahoo('GOOG') @@ -83,8 +80,6 @@ def test_get_data(self): sl = ['AAPL', 'AMZN', 'GOOG'] pan = web.get_data_yahoo(sl, '2012') ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG] - # the provider results are subject to change, disabled. GH2847 - # assert result == expected assert ts[0].dayofyear == 96 dfi = web.get_components_yahoo('^DJI') @@ -93,33 +88,20 @@ def test_get_data(self): result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist() assert result == expected - pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-12', - adjust_price=True) - expected = [18.38, 27.45, 24.54] - result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist() - # the provider results are subject to change, disabled. 
GH2847 - # assert result == expected - - # sanity checking - t= np.array(result) - assert np.issubdtype(t.dtype, np.floating) - assert t.shape == (3,) - - pan = web.get_data_yahoo(dfi, '2011', ret_index=True) - d = [[ 1.01757469, 1.01130524, 1.02414183], - [ 1.00292912, 1.00770812, 1.01735194], - [ 1.00820152, 1.00462487, 1.01320257], - [ 1.08025776, 0.99845838, 1.00113165]] - - expected = pd.DataFrame(d) - result = pan.Ret_Index.ix['01-18-11':'01-21-11'][['GE', 'INTC', 'MSFT']] - # the provider results are subject to change, disabled. GH2847 - # assert_almost_equal(result.values, expected.values) - - # sanity checking - t= np.array(result) - assert np.issubdtype(t.dtype, np.floating) - assert t.shape == (4, 3) + expected = [[ 18.99, 28.4 , 25.18], + [ 18.58, 28.31, 25.13], + [ 19.03, 28.16, 25.52], + [ 18.81, 28.82, 25.87]] + result = pan.Open.ix['Jan-15-12':'Jan-20-12'][['GE', 'MSFT', 'INTC']].values + assert (result == expected).all() + + #Check ret_index + pan = web.get_data_yahoo(['GE', 'INTC', 'IBM'], '1977', '1987', + ret_index=True) + tstamp = pan.Ret_Index.INTC.first_valid_index() + result = pan.Ret_Index.ix[tstamp]['INTC'] + expected = 1.0 + assert result == expected if __name__ == '__main__': From d4f9c4c97d56c5123b51955a9a3a4821fc7d2e5b Mon Sep 17 00:00:00 2001 From: "Nicholaus E. 
Halecky" Date: Sat, 16 Feb 2013 01:29:49 -0800 Subject: [PATCH 3/3] TST: Included sanity tests in test_yahoo.py --- pandas/io/data.py | 6 +++--- pandas/io/tests/test_yahoo.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pandas/io/data.py b/pandas/io/data.py index 06ab8439a6b01..1acf76bc1169d 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -116,7 +116,7 @@ def get_quote_yahoo(symbols): return None for line in lines: - fields = line.decode().strip().split(',') + fields = line.decode('utf-8').strip().split(',') for i, field in enumerate(fields): if field[-2:] == '%"': data[header[i]].append(float(field.strip('"%'))) @@ -253,7 +253,7 @@ def get_components_yahoo(idx_sym): #break when no new components are found while (True in mask): urlStr = url.format(idx_mod, stats, comp_idx) - lines = (urllib.urlopen(urlStr).read().decode().strip(). + lines = (urllib.urlopen(urlStr).read().decode('utf-8').strip(). strip('"').split('"\r\n"')) lines = [line.strip().split('","') for line in lines] @@ -271,7 +271,7 @@ def get_components_yahoo(idx_sym): def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0, adjust_price=False, ret_index=False, chunksize=25, - log_info=False, **kwargs): + **kwargs): """ Returns DataFrame/Panel of historical stock prices from symbols, over date range, start to end. To avoid being penalized by Yahoo! 
Finance servers, diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py index 2e339b4839ef3..69994e5e75d9c 100644 --- a/pandas/io/tests/test_yahoo.py +++ b/pandas/io/tests/test_yahoo.py @@ -43,12 +43,14 @@ def test_yahoo(self): raise + @slow @network def test_get_quote(self): df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG'])) assert_series_equal(df.ix[0], df.ix[2]) + @slow @network def test_get_components(self): @@ -70,8 +72,10 @@ def test_get_components(self): assert 'AMZN' in df.index + @slow @network def test_get_data(self): + import numpy as np #single symbol #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d df = web.get_data_yahoo('GOOG') @@ -88,6 +92,11 @@ def test_get_data(self): result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist() assert result == expected + # sanity checking + t= np.array(result) + assert np.issubdtype(t.dtype, np.floating) + assert t.shape == (3,) + expected = [[ 18.99, 28.4 , 25.18], [ 18.58, 28.31, 25.13], [ 19.03, 28.16, 25.52], @@ -103,6 +112,10 @@ def test_get_data(self): expected = 1.0 assert result == expected + # sanity checking + t= np.array(result) + assert np.issubdtype(t.dtype, np.floating) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],