pandas-dev · nehalecky · Feb 15, 2013 · Feb 15, 2013 · Feb 16, 2013
diff --git a/pandas/io/data.py b/pandas/io/data.py
@@ -14,6 +14,7 @@
 from zipfile import ZipFile
 from pandas.util.py3compat import StringIO, BytesIO, bytes_to_str
 
+import pandas as pd
 from pandas import Panel, DataFrame, Series, read_csv, concat
 from pandas.io.parsers import TextParser
 
@@ -115,7 +116,7 @@ def get_quote_yahoo(symbols):
         return None
 
     for line in lines:
-        fields = line.strip().split(',')
+        fields = line.decode('utf-8').strip().split(',')
         for i, field in enumerate(fields):
             if field[-2:] == '%"':
                 data[header[i]].append(float(field.strip('"%')))
@@ -133,7 +134,7 @@ def get_quote_yahoo(symbols):
 
 
 def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
-                    pause=0):
+                    pause=0, **kwargs):
     """
     Get historical data for the given name from yahoo.
     Date format is datetime
@@ -195,11 +196,22 @@ def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']):
 
 def _calc_return_index(price_df):
     """
-    Return a returns index from a input price df or series.
+    Return a returns index from an input price df or series. Initial value
+    (typically NaN) is set to 1.
     """
-
-    ret_index =  price_df.pct_change().add(1).cumprod()
-    ret_index.ix[0] = 1
+    df = price_df.pct_change().add(1).cumprod()
+    mask = ~df.ix[1].isnull() & df.ix[0].isnull()
+    df.ix[0][mask] = 1
+
+    # Symbols whose first listing falls after the start of the index: find
+    # each one's first valid entry and set the entry just before it to 1.
+    if (~mask).any():
+        for sym in mask.index[~mask]:
+            tstamp = df[sym].first_valid_index()
+            t_idx = df.index.get_loc(tstamp) - 1
+            df[sym].ix[t_idx] = 1
+
+    ret_index = df
     return ret_index
 
 
@@ -241,7 +253,7 @@ def get_components_yahoo(idx_sym):
     #break when no new components are found
     while (True in mask):
         urlStr = url.format(idx_mod, stats,  comp_idx)
-        lines = (urllib.urlopen(urlStr).read().strip().
+        lines = (urllib.urlopen(urlStr).read().decode('utf-8').strip().
                  strip('"').split('"\r\n"'))
 
         lines = [line.strip().split('","') for line in lines]
@@ -258,16 +270,17 @@ def get_components_yahoo(idx_sym):
 
 
 def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0,
-                   adjust_price=False, ret_index=False, chunksize=25, **kwargs):
+                   adjust_price=False, ret_index=False, chunksize=25,
+                   **kwargs):
     """
     Returns DataFrame/Panel of historical stock prices from symbols, over date
     range, start to end. To avoid being penalized by Yahoo! Finance servers,
     pauses between downloading 'chunks' of symbols can be specified.
 
     Parameters
     ----------
-    symbols : string, list-like object (list, tupel, Series), or DataFrame
-        Single stock symbol (ticker), list-like object of symbols or
+    symbols : string, array-like object (list, tuple, Series), or DataFrame
+        Single stock symbol (ticker), array-like object of symbols or
         DataFrame with index containing stock symbols.
     start : string, (defaults to '1/1/2010')
         Starting date, timestamp. Parses many different kind of date
@@ -290,7 +303,7 @@ def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0,
 
     Returns
     -------
-    hist_data : DataFrame (str) or Panel (list-like object, DataFrame)
+    hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
     """
 
     def dl_mult_symbols(symbols):

diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py
@@ -71,6 +71,7 @@ def test_get_components(self):
         assert 'GOOG' in df.index
         assert 'AMZN' in df.index
 
+
     @slow
     @network
     def test_get_data(self):
@@ -83,8 +84,6 @@ def test_get_data(self):
         sl = ['AAPL', 'AMZN', 'GOOG']
         pan = web.get_data_yahoo(sl, '2012')
         ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG]
-        # the provider results are subject to change, disabled. GH2847
-        # assert result == expected
         assert ts[0].dayofyear == 96
 
         dfi = web.get_components_yahoo('^DJI')
@@ -93,33 +92,29 @@ def test_get_data(self):
         result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
         assert result == expected
 
-        pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-12',
-                                 adjust_price=True)
-        expected = [18.38, 27.45, 24.54]
-        result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
-        # the provider results are subject to change, disabled. GH2847
-        # assert result == expected
-
         # sanity checking
         t= np.array(result)
         assert     np.issubdtype(t.dtype, np.floating)
         assert     t.shape == (3,)
 
-        pan = web.get_data_yahoo(dfi, '2011', ret_index=True)
-        d = [[ 1.01757469,  1.01130524,  1.02414183],
-             [ 1.00292912,  1.00770812,  1.01735194],
-             [ 1.00820152,  1.00462487,  1.01320257],
-             [ 1.08025776,  0.99845838,  1.00113165]]
-
-        expected = pd.DataFrame(d)
-        result = pan.Ret_Index.ix['01-18-11':'01-21-11'][['GE', 'INTC', 'MSFT']]
-        # the provider results are subject to change, disabled. GH2847
-        # assert_almost_equal(result.values, expected.values)
+        expected = [[ 18.99,  28.4 ,  25.18],
+                    [ 18.58,  28.31,  25.13],
+                    [ 19.03,  28.16,  25.52],
+                    [ 18.81,  28.82,  25.87]]
+        result = pan.Open.ix['Jan-15-12':'Jan-20-12'][['GE', 'MSFT', 'INTC']].values
+        assert (result == expected).all()
+
+        #Check ret_index
+        pan = web.get_data_yahoo(['GE', 'INTC', 'IBM'], '1977', '1987',
+                                 ret_index=True)
+        tstamp = pan.Ret_Index.INTC.first_valid_index()
+        result = pan.Ret_Index.ix[tstamp]['INTC']
+        expected = 1.0
+        assert result == expected
 
         # sanity checking
         t= np.array(result)
         assert     np.issubdtype(t.dtype, np.floating)
-        assert     t.shape == (4, 3)
 
 
 if __name__ == '__main__':