From b3e0c3911c6e02b7407b8bad7fd6907814bb106b Mon Sep 17 00:00:00 2001
From: "Nicholaus E. Halecky" <nehalecky@gmail.com>
Date: Fri, 15 Feb 2013 11:16:07 -0800
Subject: [PATCH 1/3] BUG/ENH: Fixed decode issue in get_data_yahoo()

---
 pandas/io/data.py | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/pandas/io/data.py b/pandas/io/data.py
index 5e92fcaa62427..06ab8439a6b01 100644
--- a/pandas/io/data.py
+++ b/pandas/io/data.py
@@ -14,6 +14,7 @@
 from zipfile import ZipFile
 from pandas.util.py3compat import StringIO, BytesIO, bytes_to_str
 
+import pandas as pd
 from pandas import Panel, DataFrame, Series, read_csv, concat
 from pandas.io.parsers import TextParser
 
@@ -115,7 +116,7 @@ def get_quote_yahoo(symbols):
         return None
 
     for line in lines:
-        fields = line.strip().split(',')
+        fields = line.decode().strip().split(',')
         for i, field in enumerate(fields):
             if field[-2:] == '%"':
                 data[header[i]].append(float(field.strip('"%')))
@@ -133,7 +134,7 @@ def get_quote_yahoo(symbols):
 
 
 def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
-                    pause=0):
+                    pause=0, **kwargs):
     """
     Get historical data for the given name from yahoo.
     Date format is datetime
@@ -195,11 +196,22 @@ def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']):
 
 def _calc_return_index(price_df):
     """
-    Return a returns index from a input price df or series.
+    Return a returns index from an input price df. The entry preceding each
+    column's first valid return is set to 1 (base value of the index).
     """
-
-    ret_index =  price_df.pct_change().add(1).cumprod()
-    ret_index.ix[0] = 1
+    ret_index = price_df.pct_change().add(1).cumprod()
+    mask = ~ret_index.ix[1].isnull() & ret_index.ix[0].isnull()
+    ret_index.ix[0][mask] = 1
+
+    # Symbols first listed after the start date have leading NaNs: set the
+    # entry preceding their first valid return to 1 rather than row 0.
+    for sym in mask.index[~mask]:
+        tstamp = ret_index[sym].first_valid_index()
+        if tstamp is None:
+            continue  # no valid data at all for this symbol; leave as NaN
+        t_idx = ret_index.index.get_loc(tstamp) - 1
+        ret_index[sym].ix[t_idx] = 1
+
     return ret_index
 
 
@@ -241,7 +253,7 @@ def get_components_yahoo(idx_sym):
     #break when no new components are found
     while (True in mask):
         urlStr = url.format(idx_mod, stats,  comp_idx)
-        lines = (urllib.urlopen(urlStr).read().strip().
+        lines = (urllib.urlopen(urlStr).read().decode().strip().
                  strip('"').split('"\r\n"'))
 
         lines = [line.strip().split('","') for line in lines]
@@ -258,7 +270,8 @@ def get_components_yahoo(idx_sym):
 
 
 def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0,
-                   adjust_price=False, ret_index=False, chunksize=25, **kwargs):
+                   adjust_price=False, ret_index=False, chunksize=25,
+                   log_info=False, **kwargs):
     """
     Returns DataFrame/Panel of historical stock prices from symbols, over date
     range, start to end. To avoid being penalized by Yahoo! Finance servers,
@@ -266,8 +279,8 @@ def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0,
 
     Parameters
     ----------
-    symbols : string, list-like object (list, tupel, Series), or DataFrame
-        Single stock symbol (ticker), list-like object of symbols or
+    symbols : string, array-like object (list, tuple, Series), or DataFrame
+        Single stock symbol (ticker), array-like object of symbols or
         DataFrame with index containing stock symbols.
     start : string, (defaults to '1/1/2010')
         Starting date, timestamp. Parses many different kind of date
@@ -290,7 +303,7 @@ def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0,
 
     Returns
     -------
-    hist_data : DataFrame (str) or Panel (list-like object, DataFrame)
+    hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
     """
 
     def dl_mult_symbols(symbols):

From ecea4015efe93d13a71706833da5bb342c081899 Mon Sep 17 00:00:00 2001
From: "Nicholaus E. Halecky" <nehalecky@gmail.com>
Date: Fri, 15 Feb 2013 11:18:17 -0800
Subject: [PATCH 2/3] TST: Better tests for yahoo finance features.

---
 pandas/io/tests/test_yahoo.py | 48 +++++++++++------------------------
 1 file changed, 15 insertions(+), 33 deletions(-)

diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py
index 2d56260abebbd..2e339b4839ef3 100644
--- a/pandas/io/tests/test_yahoo.py
+++ b/pandas/io/tests/test_yahoo.py
@@ -43,14 +43,12 @@ def test_yahoo(self):
                 raise
 
 
-    @slow
     @network
     def test_get_quote(self):
         df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG']))
         assert_series_equal(df.ix[0], df.ix[2])
 
 
-    @slow
     @network
     def test_get_components(self):
 
@@ -71,10 +69,9 @@ def test_get_components(self):
         assert 'GOOG' in df.index
         assert 'AMZN' in df.index
 
-    @slow
+
     @network
     def test_get_data(self):
-        import numpy as np
         #single symbol
         #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d
         df = web.get_data_yahoo('GOOG')
@@ -83,8 +80,6 @@ def test_get_data(self):
         sl = ['AAPL', 'AMZN', 'GOOG']
         pan = web.get_data_yahoo(sl, '2012')
         ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG]
-        # the provider results are subject to change, disabled. GH2847
-        # assert result == expected
         assert ts[0].dayofyear == 96
 
         dfi = web.get_components_yahoo('^DJI')
@@ -93,33 +88,20 @@ def test_get_data(self):
         result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
         assert result == expected
 
-        pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-12',
-                                 adjust_price=True)
-        expected = [18.38, 27.45, 24.54]
-        result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
-        # the provider results are subject to change, disabled. GH2847
-        # assert result == expected
-
-        # sanity checking
-        t= np.array(result)
-        assert     np.issubdtype(t.dtype, np.floating)
-        assert     t.shape == (3,)
-
-        pan = web.get_data_yahoo(dfi, '2011', ret_index=True)
-        d = [[ 1.01757469,  1.01130524,  1.02414183],
-             [ 1.00292912,  1.00770812,  1.01735194],
-             [ 1.00820152,  1.00462487,  1.01320257],
-             [ 1.08025776,  0.99845838,  1.00113165]]
-
-        expected = pd.DataFrame(d)
-        result = pan.Ret_Index.ix['01-18-11':'01-21-11'][['GE', 'INTC', 'MSFT']]
-        # the provider results are subject to change, disabled. GH2847
-        # assert_almost_equal(result.values, expected.values)
-
-        # sanity checking
-        t= np.array(result)
-        assert     np.issubdtype(t.dtype, np.floating)
-        assert     t.shape == (4, 3)
+        expected = [[ 18.99,  28.4 ,  25.18],
+                    [ 18.58,  28.31,  25.13],
+                    [ 19.03,  28.16,  25.52],
+                    [ 18.81,  28.82,  25.87]]
+        result = pan.Open.ix['Jan-15-12':'Jan-20-12'][['GE', 'MSFT', 'INTC']].values
+        assert (result == expected).all()
+
+        #Check ret_index
+        pan = web.get_data_yahoo(['GE', 'INTC', 'IBM'], '1977', '1987',
+                                 ret_index=True)
+        tstamp = pan.Ret_Index.INTC.first_valid_index()
+        result = pan.Ret_Index.ix[tstamp]['INTC']
+        expected = 1.0
+        assert result == expected
 
 
 if __name__ == '__main__':

From d4f9c4c97d56c5123b51955a9a3a4821fc7d2e5b Mon Sep 17 00:00:00 2001
From: "Nicholaus E. Halecky" <nehalecky@gmail.com>
Date: Sat, 16 Feb 2013 01:29:49 -0800
Subject: [PATCH 3/3] TST: Included sanity tests in test_yahoo.py

---
 pandas/io/data.py             |  6 +++---
 pandas/io/tests/test_yahoo.py | 13 +++++++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/pandas/io/data.py b/pandas/io/data.py
index 06ab8439a6b01..1acf76bc1169d 100644
--- a/pandas/io/data.py
+++ b/pandas/io/data.py
@@ -116,7 +116,7 @@ def get_quote_yahoo(symbols):
         return None
 
     for line in lines:
-        fields = line.decode().strip().split(',')
+        fields = line.decode('utf-8').strip().split(',')
         for i, field in enumerate(fields):
             if field[-2:] == '%"':
                 data[header[i]].append(float(field.strip('"%')))
@@ -253,7 +253,7 @@ def get_components_yahoo(idx_sym):
     #break when no new components are found
     while (True in mask):
         urlStr = url.format(idx_mod, stats,  comp_idx)
-        lines = (urllib.urlopen(urlStr).read().decode().strip().
+        lines = (urllib.urlopen(urlStr).read().decode('utf-8').strip().
                  strip('"').split('"\r\n"'))
 
         lines = [line.strip().split('","') for line in lines]
@@ -271,7 +271,7 @@ def get_components_yahoo(idx_sym):
 
 def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0,
                    adjust_price=False, ret_index=False, chunksize=25,
-                   log_info=False, **kwargs):
+                   **kwargs):
     """
     Returns DataFrame/Panel of historical stock prices from symbols, over date
     range, start to end. To avoid being penalized by Yahoo! Finance servers,
diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py
index 2e339b4839ef3..69994e5e75d9c 100644
--- a/pandas/io/tests/test_yahoo.py
+++ b/pandas/io/tests/test_yahoo.py
@@ -43,12 +43,14 @@ def test_yahoo(self):
                 raise
 
 
+    @slow
     @network
     def test_get_quote(self):
         df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG']))
         assert_series_equal(df.ix[0], df.ix[2])
 
 
+    @slow
     @network
     def test_get_components(self):
 
@@ -70,8 +72,10 @@ def test_get_components(self):
         assert 'AMZN' in df.index
 
 
+    @slow
     @network
     def test_get_data(self):
+        import numpy as np
         #single symbol
         #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d
         df = web.get_data_yahoo('GOOG')
@@ -88,6 +92,11 @@ def test_get_data(self):
         result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
         assert result == expected
 
+        # sanity checking
+        t= np.array(result)
+        assert     np.issubdtype(t.dtype, np.floating)
+        assert     t.shape == (3,)
+
         expected = [[ 18.99,  28.4 ,  25.18],
                     [ 18.58,  28.31,  25.13],
                     [ 19.03,  28.16,  25.52],
@@ -103,6 +112,10 @@ def test_get_data(self):
         expected = 1.0
         assert result == expected
 
+        # sanity checking
+        t= np.array(result)
+        assert     np.issubdtype(t.dtype, np.floating)
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],