Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.21.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ Indexing
I/O
^^^

- Bug in :class:`~pandas.io.stata.StataReader` where date/time columns whose format included display formatting details were not converted to dates and were left as numbers (:issue:`17990`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use :class:`~pandas.io.stata.StataReader`? Also, can you make it a bit clearer what is being fixed here?



Plotting
^^^^^^^^

Expand Down
26 changes: 13 additions & 13 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,39 +306,42 @@ def convert_delta_safe(base, deltas, unit):
data_col[bad_locs] = 1.0 # Replace with NaT
dates = dates.astype(np.int64)

if fmt in ["%tc", "tc"]: # Delta ms relative to base
if fmt.startswith(("%tc", "tc")): # Delta ms relative to base
base = stata_epoch
ms = dates
conv_dates = convert_delta_safe(base, ms, 'ms')
elif fmt in ["%tC", "tC"]:
elif fmt.startswith(("%tC", "tC")):
from warnings import warn

warn("Encountered %tC format. Leaving in Stata Internal Format.")
conv_dates = Series(dates, dtype=np.object)
if has_bad_values:
conv_dates[bad_locs] = pd.NaT
return conv_dates
elif fmt in ["%td", "td", "%d", "d"]: # Delta days relative to base
# Delta days relative to base
elif fmt.startswith(("%td", "td", "%d", "d")):
base = stata_epoch
days = dates
conv_dates = convert_delta_safe(base, days, 'd')
elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week
# does not count leap days - 7 days is a week.
# 52nd week may have more than 7 days
elif fmt.startswith(("%tw", "tw")):
year = stata_epoch.year + dates // 52
days = (dates % 52) * 7
conv_dates = convert_year_days_safe(year, days)
elif fmt in ["%tm", "tm"]: # Delta months relative to base
elif fmt.startswith(("%tm", "tm")): # Delta months relative to base
year = stata_epoch.year + dates // 12
month = (dates % 12) + 1
conv_dates = convert_year_month_safe(year, month)
elif fmt in ["%tq", "tq"]: # Delta quarters relative to base
elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base
year = stata_epoch.year + dates // 4
month = (dates % 4) * 3 + 1
conv_dates = convert_year_month_safe(year, month)
elif fmt in ["%th", "th"]: # Delta half-years relative to base
elif fmt.startswith(("%th", "th")): # Delta half-years relative to base
year = stata_epoch.year + dates // 2
month = (dates % 2) * 6 + 1
conv_dates = convert_year_month_safe(year, month)
elif fmt in ["%ty", "ty"]: # Years -- not delta
elif fmt.startswith(("%ty", "ty")): # Years -- not delta
year = dates
month = np.ones_like(dates)
conv_dates = convert_year_month_safe(year, month)
Expand Down Expand Up @@ -1029,10 +1032,6 @@ def _read_header(self):
# calculate size of a data record
self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist)

# remove format details from %td
self.fmtlist = ["%td" if x.startswith("%td") else x
for x in self.fmtlist]

def _read_new_header(self, first_char):
# The first part of the header is common to 117 and 118.
self.path_or_buf.read(27) # stata_dta><header><release>
Expand Down Expand Up @@ -1578,7 +1577,8 @@ def read(self, nrows=None, convert_dates=None,
self._do_convert_missing(data, convert_missing)

if convert_dates:
cols = np.where(lmap(lambda x: x in _date_formats,
cols = np.where(lmap(lambda x: any(x.startswith(fmt)
for fmt in _date_formats),
self.fmtlist))[0]
for i in cols:
col = data.columns[i]
Expand Down
Binary file added pandas/tests/io/data/stata13_dates.dta
Binary file not shown.
21 changes: 21 additions & 0 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ def setup_method(self, method):

self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')

self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')

def read_dta(self, file):
# Legacy default reader configuration
return read_stata(file, convert_dates=True)
Expand Down Expand Up @@ -1327,3 +1329,22 @@ def test_set_index(self):
df.to_stata(path)
reread = pd.read_stata(path, index_col='index')
tm.assert_frame_equal(df, reread)

@pytest.mark.parametrize(
'column', ['ms', 'day', 'week', 'month', 'qtr', 'half', 'yr'])
def test_date_parsing_ignores_format_details(self, column):
# GH 17797
#
# Test that display formats are ignored when determining if a numeric
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: add a newline under the issue reference.

# column is a date value.
#
# All date types are stored as numbers and the format associated with the
# column denotes both the type of the date and the display format.
#
# STATA supports 9 date types which each have distinct units. We test 7
# of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that
# accounts for leap seconds and %tb relies on STATA's business calendar.
df = read_stata(self.stata_dates)
unformatted = df.loc[0, column]
formatted = df.loc[0, column + "_fmt"]
assert unformatted == formatted
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these supposed to be datetime64[ns] dtype?

what happens for the ignored formats? should raise?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these supposed to be datetime64[ns] dtype?

At this point in the code, formatted and unformatted are pandas._libs.tslib.Timestamp objects. Every column in df has a dtype of datetime64[ns].

what happens for the ignored formats? should raise?

Ignored formats are not converted to dates (consistent with previous behavior) source