@@ -13,6 +13,7 @@
 
 import re
 from pandas.core.dtypes.missing import notna
+from pandas.core.tools.numeric import to_numeric
 
 
 @Appender(_shared_docs['melt'] %
@@ -199,6 +200,9 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
 
         .. versionadded:: 0.20.0
 
+        .. versionchanged:: 0.22.0
+            When all suffixes are numeric, they are cast to int64/float64.
+
     Returns
     -------
     DataFrame
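A minimal sketch of the behaviour the new versionchanged note describes, assuming the patch in this diff is applied; the frame and names below are made up for illustration only.

    import pandas as pd

    # Hypothetical wide frame whose 'ht' stub carries purely numeric suffixes.
    df = pd.DataFrame({'famid': [1, 2], 'ht1': [2.8, 2.0], 'ht2': [3.4, 3.2]})
    l = pd.wide_to_long(df, stubnames='ht', i='famid', j='age')

    # With the to_numeric() cast added further down in this diff, the 'age'
    # level comes back as int64; before the patch it stayed object.
    print(l.index.get_level_values('age').dtype)   # int64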
@@ -278,8 +282,8 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
 
     Going from long back to wide just takes some creative use of `unstack`
 
-    >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
-    >>> w.columns = pd.Index(w.columns).str.join('')
+    >>> w = l.unstack()
+    >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
     >>> w.reset_index()
        famid  birth  ht1  ht2
     0      1      1  2.8  3.4
@@ -333,26 +337,76 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
     >>> list(stubnames)
     ['A(quarterly)', 'B(quarterly)']
 
+    All of the above examples have integers as suffixes. It is possible to
+    have non-integers as suffixes.
+
+    >>> df = pd.DataFrame({
+    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
+    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
+    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
+    ... })
+    >>> df
+       birth  famid  ht_one  ht_two
+    0      1      1     2.8     3.4
+    1      2      1     2.9     3.8
+    2      3      1     2.2     2.9
+    3      1      2     2.0     3.2
+    4      2      2     1.8     2.8
+    5      3      2     1.9     2.4
+    6      1      3     2.2     3.3
+    7      2      3     2.3     3.4
+    8      3      3     2.1     2.9
+
+    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
+    ...                     sep='_', suffix='\w+')
+    >>> l
+    ... # doctest: +NORMALIZE_WHITESPACE
+                      ht
+    famid birth age
+    1     1     one  2.8
+                two  3.4
+          2     one  2.9
+                two  3.8
+          3     one  2.2
+                two  2.9
+    2     1     one  2.0
+                two  3.2
+          2     one  1.8
+                two  2.8
+          3     one  1.9
+                two  2.4
+    3     1     one  2.2
+                two  3.3
+          2     one  2.3
+                two  3.4
+          3     one  2.1
+                two  2.9
+
     Notes
     -----
     All extra variables are left untouched. This simply uses
     `pandas.melt` under the hood, but is hard-coded to "do the right thing"
-    in a typicaly case.
+    in a typical case.
     """
     def get_var_names(df, stub, sep, suffix):
-        regex = "^{stub}{sep}{suffix}".format(
+        regex = r'^{stub}{sep}{suffix}$'.format(
             stub=re.escape(stub), sep=re.escape(sep), suffix=suffix)
-        return df.filter(regex=regex).columns.tolist()
+        pattern = re.compile(regex)
+        return [col for col in df.columns if pattern.match(col)]
 
     def melt_stub(df, stub, i, j, value_vars, sep):
         newdf = melt(df, id_vars=i, value_vars=value_vars,
                      value_name=stub.rstrip(sep), var_name=j)
         newdf[j] = Categorical(newdf[j])
         newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
 
+        # GH17627 Cast numeric suffixes to int/float
+        newdf[j] = to_numeric(newdf[j], errors='ignore')
+
         return newdf.set_index(i + [j])
 
-    if any(map(lambda s: s in df.columns.tolist(), stubnames)):
+    if any([col in stubnames for col in df.columns]):
         raise ValueError("stubname can't be identical to a column name")
 
     if not is_list_like(stubnames):
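The hunk above changes two internals: get_var_names() now matches columns with an anchored, compiled pattern, and melt_stub() casts the extracted suffixes with to_numeric(). Both can be exercised in isolation; the column names in this sketch are hypothetical.

    import re
    import pandas as pd

    # Anchored pattern as built in get_var_names(): re.escape() protects the
    # stub and separator, and the trailing '$' keeps a column such as
    # 'ht1_extra' from matching stub 'ht', which the old df.filter(regex=...)
    # (a search rather than a full match) would have allowed.
    regex = r'^{stub}{sep}{suffix}$'.format(
        stub=re.escape('ht'), sep=re.escape(''), suffix=r'\d+')
    pattern = re.compile(regex)
    cols = ['ht1', 'ht2', 'ht1_extra', 'famid']
    print([c for c in cols if pattern.match(c)])   # ['ht1', 'ht2']

    # GH17627: errors='ignore' casts only when every suffix parses as a
    # number; otherwise the input Series comes back unchanged.
    print(pd.to_numeric(pd.Series(['1', '2']), errors='ignore').dtype)      # int64
    print(pd.to_numeric(pd.Series(['one', 'two']), errors='ignore').dtype)  # object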
@@ -368,8 +422,7 @@ def melt_stub(df, stub, i, j, value_vars, sep):
     if df[i].duplicated().any():
         raise ValueError("the id variables need to uniquely identify each row")
 
-    value_vars = list(map(lambda stub:
-                          get_var_names(df, stub, sep, suffix), stubnames))
+    value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]
 
     value_vars_flattened = [e for sublist in value_vars for e in sublist]
     id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
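For completeness, the two context lines at the end of this hunk split the frame's columns into stub columns and id columns; a standalone sketch with made-up names:

    # Hypothetical per-stub output of get_var_names() for stubs 'ht' and 'wt'.
    value_vars = [['ht1', 'ht2'], ['wt1', 'wt2']]
    columns = ['famid', 'birth', 'ht1', 'ht2', 'wt1', 'wt2']

    # Flatten the nested lists, then treat every remaining column as an id
    # variable, mirroring the logic above.
    value_vars_flattened = [e for sublist in value_vars for e in sublist]
    id_vars = list(set(columns).difference(value_vars_flattened))
    print(sorted(id_vars))   # ['birth', 'famid']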