@@ -1122,6 +1122,7 @@ def __init__(
11221122 # State variables for the file
11231123 self ._close_file : Callable [[], None ] | None = None
11241124 self ._column_selector_set = False
1125+ self ._value_label_dict : dict [str , dict [int , str ]] = {}
11251126 self ._value_labels_read = False
11261127 self ._dtype : np .dtype | None = None
11271128 self ._lines_read = 0
@@ -1502,36 +1503,26 @@ def _decode(self, s: bytes) -> str:
15021503 )
15031504 return s .decode ("latin-1" )
15041505
1505- def _read_value_labels (self ) -> None :
1506- self ._ensure_open ()
1507- if self ._value_labels_read :
1508- # Don't read twice
1509- return
1510- if self ._format_version <= 108 :
1511- # Value labels are not supported in version 108 and earlier.
1512- self ._value_labels_read = True
1513- self ._value_label_dict : dict [str , dict [float , str ]] = {}
1514- return
1515-
1506+ def _read_new_value_labels (self ) -> None :
1507+ """Reads value labels with variable length strings (108 and later format)"""
15161508 if self ._format_version >= 117 :
15171509 self ._path_or_buf .seek (self ._seek_value_labels )
15181510 else :
15191511 assert self ._dtype is not None
15201512 offset = self ._nobs * self ._dtype .itemsize
15211513 self ._path_or_buf .seek (self ._data_location + offset )
15221514
1523- self ._value_labels_read = True
1524- self ._value_label_dict = {}
1525-
15261515 while True :
15271516 if self ._format_version >= 117 :
15281517 if self ._path_or_buf .read (5 ) == b"</val" : # <lbl>
15291518 break # end of value label table
15301519
15311520 slength = self ._path_or_buf .read (4 )
15321521 if not slength :
1533- break # end of value label table (format < 117)
1534- if self ._format_version <= 117 :
1522+ break # end of value label table (format < 117), or end-of-file
1523+ if self ._format_version == 108 :
1524+ labname = self ._decode (self ._path_or_buf .read (9 ))
1525+ elif self ._format_version <= 117 :
15351526 labname = self ._decode (self ._path_or_buf .read (33 ))
15361527 else :
15371528 labname = self ._decode (self ._path_or_buf .read (129 ))
@@ -1555,8 +1546,45 @@ def _read_value_labels(self) -> None:
15551546 self ._value_label_dict [labname ][val [i ]] = self ._decode (
15561547 txt [off [i ] : end ]
15571548 )
1549+
15581550 if self ._format_version >= 117 :
15591551 self ._path_or_buf .read (6 ) # </lbl>
1552+
1553+ def _read_old_value_labels (self ) -> None :
1554+ """Reads value labels with fixed-length strings (105 and earlier format)"""
1555+ assert self ._dtype is not None
1556+ offset = self ._nobs * self ._dtype .itemsize
1557+ self ._path_or_buf .seek (self ._data_location + offset )
1558+
1559+ while True :
1560+ if not self ._path_or_buf .read (2 ):
1561+ # end-of-file may have been reached, if so stop here
1562+ break
1563+
1564+ # otherwise back up and read again, taking byteorder into account
1565+ self ._path_or_buf .seek (- 2 , os .SEEK_CUR )
1566+ n = self ._read_uint16 ()
1567+ labname = self ._decode (self ._path_or_buf .read (9 ))
1568+ self ._path_or_buf .read (1 ) # padding
1569+ codes = np .frombuffer (
1570+ self ._path_or_buf .read (2 * n ), dtype = f"{ self ._byteorder } i2" , count = n
1571+ )
1572+ self ._value_label_dict [labname ] = {}
1573+ for i in range (n ):
1574+ self ._value_label_dict [labname ][codes [i ]] = self ._decode (
1575+ self ._path_or_buf .read (8 )
1576+ )
1577+
1578+ def _read_value_labels (self ) -> None :
1579+ self ._ensure_open ()
1580+ if self ._value_labels_read :
1581+ # Don't read twice
1582+ return
1583+
1584+ if self ._format_version >= 108 :
1585+ self ._read_new_value_labels ()
1586+ else :
1587+ self ._read_old_value_labels ()
15601588 self ._value_labels_read = True
15611589
15621590 def _read_strls (self ) -> None :
@@ -1729,7 +1757,7 @@ def read(
17291757 i , _stata_elapsed_date_to_datetime_vec (data .iloc [:, i ], fmt )
17301758 )
17311759
1732- if convert_categoricals and self . _format_version > 108 :
1760+ if convert_categoricals :
17331761 data = self ._do_convert_categoricals (
17341762 data , self ._value_label_dict , self ._lbllist , order_categoricals
17351763 )
@@ -1845,7 +1873,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra
18451873 def _do_convert_categoricals (
18461874 self ,
18471875 data : DataFrame ,
1848- value_label_dict : dict [str , dict [float , str ]],
1876+ value_label_dict : dict [str , dict [int , str ]],
18491877 lbllist : Sequence [str ],
18501878 order_categoricals : bool ,
18511879 ) -> DataFrame :
@@ -1983,7 +2011,7 @@ def variable_labels(self) -> dict[str, str]:
19832011 self ._ensure_open ()
19842012 return dict (zip (self ._varlist , self ._variable_labels ))
19852013
1986- def value_labels (self ) -> dict [str , dict [float , str ]]:
2014+ def value_labels (self ) -> dict [str , dict [int , str ]]:
19872015 """
19882016 Return a nested dict associating each variable name to its value and label.
19892017
0 commit comments