From f7a483558f44fc49d6e96c25cf73fc3785f6d2fb Mon Sep 17 00:00:00 2001 From: Lucas Leadbetter <5595530+lleadbet@users.noreply.github.com> Date: Thu, 20 Nov 2025 11:07:11 -0500 Subject: [PATCH 1/2] add support for sheet_name in openpyxl_chunk_reader --- .../sample_files/test-with-multiple-sheets.xlsx | Bin 0 -> 7349 bytes .../connectors/source-file/source_file/client.py | 11 +++++++++-- .../source-file/unit_tests/test_client.py | 13 +++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 airbyte-integrations/connectors/source-file/integration_tests/sample_files/test-with-multiple-sheets.xlsx diff --git a/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test-with-multiple-sheets.xlsx b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test-with-multiple-sheets.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..d7b279b1b31db67ad088ee7fde92eaf2336bffd6 GIT binary patch literal 7349 zcmeHMWmHvNw?1@tNr!+mNJ~n0cL^L0(s__BX$~MEA&qo5NTW0&-5}j1prq72@_t_5 z&)4t%yT9(;W9+@hK5NW*)|zw8`8;b*RRuVBd;kD|485rVeKDBK<}d)j0|Wp75BiIN zgoC|{xxI_AriY_B*oe*D&bBLlNx6dqzvocNJ2dZI2tfl!4@bk%a9&&(mg+Ru)JUDu zFwDw ziAl#i_&AK%++^{Jo&;l7Q_52v!>q3^;ZTHavBh{jR7!xsM=Ke(mOTmMQH)tw7WWij zVn!&+BPVhm>~;x0=Kk(aW){?rw-2^1@p>6d758#D7gN9)O<9DqR%t5BZ>Lp3P%4lU zBao|i(Ul8@Qoc3N>_UaC%a6G4L`WDy3tnHAB}H@p5-l7~0WQ~qzY&i;%&CF%o?Ha? zxl}}nLT!F`2M18S_X^^_yi%vd@$dvXq6iBBpaGC!jh)SH!R%~5o>bZJunbuM=)nJ9 z|CcRL8mFYv$$=hnD0>~+cRslggDtM$CMMfVt?BD8_Zh1$Du}8jxTHX^>C`+ zbYlJvhG^Im#r?5lf<7?NVz%77&zAc1l0-|x@`+U?FvD4Z#={t7`~I6`YCGPIH>+}5 zpE?aU?gi(tOfOy5sh@r=_whiwM;9MTXGz~te?SCf;UY9a{R)zwsY|{n-}F0l3&_9$ z03^`2yDj@)apLCSYy)&~u=$a^?qjAUZqT}m13l<4xGiwB)hX^)^`n3nV{mKzJQ7D0 zZm5#2S&^v6RZ9F9-pbshksMm?Z;wAczj|`AO8)T0A|pdJ8GO@R>QEi(vvU=lbE>d9 zuBLNLf=JoQGB$SJ;Taoc6;Sh2nj(I6ljP38cufN`ZgY_=g`}QdF))c4b6`1Ela$`( zWqM>gPgaT*BwY>LZIh(S7rp<|!ETA$C@Q+mmvrQq7_C1X?RC~G*6IMO>t}>|um~O* zZw+6sboRpONs2g;>ChG^G{S3KbA1AGzO(6wnZy}H>(vwy$m23#rvuk8p-21dey?KY z9uz%u*1p?C`;*0D-ojz_wjm`QlfHe>XKNc ziZHulriA1|?T1%;EMjo;#XKdP+I^7dee#L}6~6X6*Vfow59*B?1b~)sHMndVrsqY< zx420i=RxL5mOQ59khl6Po@M+4yJ=t}!UmbJXa#yEuW0N-1&(!RMsXXiuBGRK zoQZ&7F@8~8o#8ICe~;Ig#jQpFqYqKQfd@L9gmcv@d&l}wpZp~yXoIrqnvX-?mpCe* zR=*4$y9F!TL&jeCYSWVu^|m3YynAYIxRg7Y$+pC#?iJTt58mo4j(E~rdvTt;jqo@I8n-$R@k5*8aADoD%&p!N zp2CecP8I?|YZW{fp?d*0gG7_cCiD})0*~p3FzedZ?0#twk0-kS2s6b!Bf6uH&XVl(s!)_-*Qnxh(|F|^@Ah(2`*J^piwo*IB`1oK7?#3x0+NLKuseq=MRn#Q zi=eg@ElGkxrq1LvlRrn=j@*=cog+u`maWKv1Sn@k-rhQW!D7>XQu%_cH&`yw`nbcw z7RHI=#4BX2&zCL~NsKZ%Bnv*He#e%T&s)5lwdHB`L0NGtixS<*Gd$G|B~yC|djlqp zt-|?85-b%rP4!{FCbZoZCmpXGM>ms?kP|Vze4CZf_!C6LB!^R@vU!Siw)c;B1@#MFGshqtCTvicXJSr}{zT_Mf>hSAG7$5|k6Of z#lzMd{38dJY8g1Bay;L4is2Z=yw5ZgJwB?~GP zHTJ!x4eA-nvnN~4afY?(Su)Q_%C#VYT$B&SRV1r*nI(@!-1s!9;PsVT5{3b2#bZpdX2zon=)cA{gdR8E6bdZ5QBE+D8)lse8i z*ej@!Wf7L9=(4S>ReT^tzil;O-Y7IUmSvOACJ)01C1ws0hgFqSQ87%v=Fv{6iaRZP zB$S#C1Cq9jR}s9J+@zle5z5d$^`%tD#uF7fZsj?YgoK$!f6l|QkjqQ3Q+@mHV?I9_ zy589J(`e17o<54zxg+AFps5o4t;VIBG(ENc?`BCM8Y_xMOh`HF*t!<9uW}1KDQ3lQ zsjokvWf3G8?tA7Rsj!gi!Py#yY4pC2p;LuPBu?l=#Mlcp!dORz)Dg|6gQtz3wDP(T z1s!k0?jYBI@{~{ptFWv9)CWp>G`i;0WFZxNU#wnW+%em+;;SacrNp4nfqusm&(~a0%4oDqHEw_@oMp1JL@o4ZX(+ZRYqiTNS_!WYDY)1=T^+aeNDp0{Fjj2cPU~J|dB)Pp z-=xoybomMEro=cW`8yU~-4*=PSj5p591gd^<$bE#TT6vw%khCjPeM?1-Ec>^A2Cu} zQs&i5BBkw#ROKNv_UJ&zC*g$CP889#y@zQnA}9gAA%e%ppPl>Rb*>B#xVY=Z<4mp~ zc%V!a=j*oT?f=FKep2V>WN8mGp%@pA`PYF4$_iX8&F#$pdOrRU51(}U?dLhLTZy(s zk(>DHfdf@h{9>zJ+i#^_Hc1UfO0uFzq0rXc_=HNZ=F;&EB6dBpy5{$;Il_41{auyg z@D*xulJ^D@lG##@5Yt*(0rTB?#b8q7p`4VvqHh1rN59-LQQ^z$oi{96(rCLMJlT>! z8y50f1EaFiCiGvkN*O}$;?(gaKa|MIR8gHTzcb0{Ja*@0k{$AiehvHCo=ZOAM1|yC z61q2-u8tZAEJ+^|;cY|V7KfZz4+go(%zUI=8>UPri|(sNtK@LYUq9=X@MgqN+ z7DNmoMh$9rvT;J$?qhLjRb`%_lhm&F=KWvY;iOBEDH_i-wTOkL=$ssjs2Y0k^1i<= zu}zIdjU!yfEGjiB7%P!S3B?;M#Bu56%fGF^c-FH?hV)XTkNz;Oa7YBxKoDP^_+-D3 zR~%OVoB@UrA}jH}gV#7ERsVU2U<-JEwa+OU@DUM9uA~&Q9=V6Q2-yx>em>VXZ`mHD6ND zWC(G%)k6AN{H0E(@Vlvq@?1>ebjT*R3-_zdLp2*#8o{~u4Q2R|UgykyyeKP8*s<0d zkEl|PbTvl=;|khjbWzX4Wl_M(#@Q?hF}neMpJ>UXdJDy3nVEzViI|N;mt5i23aR^8 z`QgPbD%T0Am?@*k@E2G)m2gQIoA(nD3aDY9u0(00-$o1=V@@#-7%-gkMjW;>yE6OY zT2>t~jG#nAF9n>* z7xi%Txld-q@u`j^Ccos>&9$ zNJ^P3f4OcvoZVXeQb^zdvqzf}m;WsYmjjmB%9ZZs_{Q`s9Cxd+s6iZ#PWWYxTWT~V z1a7X?1|Wn2za~HDzO)(nWYmAt{uS$+<)TsRFP#+&kEivQwM6BZ6Y2Uf#tlgXu${dy z;P&)UFCf|DCf9(ftOl&_Mh^}6#%dc73mO;P4ivNO;egBuCj}Jp;e?lKLs=UJ1iK>n z3T%C&A%bYbK*jNSc1|ls7JiDG4mT8(=oc5BbqhD47J%S+PsqR zgmlvgw+?%HT|>51w`!)H%=OchU^kM`b)m?%b{CBkpRXW31lMPBa4Lxn%Apv73xORH zauOwNBpDm!$D{9g5mGm`IBR7M34w3fcDA_NjeEjGmJ5Ocy}l^{MICU?M~@%4(5t~R zU{}?IUEPS9_2HWJMmoI2@)<|^Krqpwaf?=LT>Vvv-(>chS7|C@o`5`k*!==g0%pgW zf;<~caSB3Dxx3HT_fC!<{eTKfjg7dhgF{JWus|0k#~^S*LNn%C7)3t+tX<7aDkt!r z>4je#4vfs!^{ji2X~v?h$k8hNzjB ze0{}CysWJVV-?Na8I(WW-QCZzno(9!>F6#SIi5*d1Ldu8!YoR2V-hosVb`|jXSb8Kh|GyBa`O{spez3aY3q-e zvM+^>Ttn|46*(wWfh|p)%|RM2&Q|so_uLIMB!=3-fsHWIx=h}&Z>KMYBd4Kx1n=oT z@``sv(b|H^)9Yb?xX~@2_wAcq9Dx_TFMV=xAuLG2n!P>abwS^D8fN@~P7@7mxQR!S zAs>eq6=bsn%zZM)shIa@a~?%(N)j_i0{LBsQ(loV5SA~^=SaC#^h z<3LvjaxhbMc5nojd}|A65%btRQGC=6HB-cf%QexV`a8a_u=6{Zmz!=T?a7#Rwb zwH?U7{d$<(MQ4Ze$UeivAtN)>!uNJ@VwN<-GR-Iq$Vot~?>IMeD*m~w{A)W+L~f~e zT}=ZvxwO->n_C_v;{Y^wytLk@h-N+Vyot+1tI2&y>eB8#(%mRrEP*O)+>i!zn;n9d zVxEt)E}l%Jw0Q)KvuK;Xn)n8C7&*C1L7vznR8>8QtWtW4sEup5A@sh298_MPdQ`VA zl?ZR@|VI~Hp<61KyO;IhxijX%`d{QMZdc>HsMa%3JCT`(6A3_ck6k0 z>r>o$Y`^AzbEqL7^?dKR7m^7(S@{Oycp#p%->HisXcL~anTT&!A0BX(U)Mf63ElF4 za35u=QN$s11Ojy-_TSxS>gagyy_jG1Cx&w#Vp+VDF~v0nLwf$mg`9~48ht_ej)_|@ zbUXy7_2*Z0l=IvKgQ7K;ALEat_cbyCQW;7AC{Q9uaQ3>{pDlBB_n=g+AL(2m}Q7Jis zB-x$e3gDBji>`5pcWiKIe$Mn1Llt!cT6q<9w{u^OxOTU~&!D^apL@H^cznway6btM zRrCJOO_&#e^>W*%m zc3QF$$*U}37C7|dxqF#>n1bJIUa=SnOJIBoh`=$KsNa$=Qt~?bzRek>)oI*GH`5nK}klF{oG_)EFLQ~^td%kR<^rijjf{J&z$Y>v8w@=w0 ztl&jmz!KxEe>G4XmhxirNlF; zF05(&tWv;2kM-ML=KVbGAOAbx0t1VB4)?oVvt zcMZQ|@GlMeP{Q#KME+g;cMSL?PE7bW@qZ!0@6x{~$zRgEM1M5Geaci-K!7s+A8ju* NKo2y<&JzFl`Y-lrv)=#! literal 0 HcmV?d00001 diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index d0b083798575..594d0d31df60 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -517,11 +517,18 @@ def openpyxl_chunk_reader(self, file, **kwargs): skiprows = kwargs.get("skiprows", 0) user_provided_column_names = kwargs.get("names") chunk_size = 500 - + sheet_name = kwargs.get("sheet_name", None) # Load workbook with data-only to avoid loading formulas work_book = load_workbook(filename=file, data_only=True, read_only=True) - for sheetname in work_book.sheetnames: + # Iterate through sheets + # Panda's read_excel allows specifying sheet_name as str, int, list of str/int or None (all sheets) + # For simplicity, we only support str or int for now as we expect users to read one sheet at a time in most cases. + for idx, sheetname in enumerate(work_book.sheetnames): + if sheet_name and sheetname is str and sheetname != sheet_name: + continue + elif sheet_name and sheet_name is int and idx != sheet_name: + continue work_sheet = work_book[sheetname] data = list(work_sheet.iter_rows(values_only=True)) diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py index f56ffd51beba..f2cbfbc1396b 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py @@ -99,6 +99,19 @@ def test_load_dataframes_xlsx(config, absolute_path, test_files, file_name, shou expected = read_excel(f, engine="openpyxl") assert read_file.equals(expected) +@pytest.mark.parametrize("file_name, should_raise_error, sheet_name", [("test-with-multiple-sheets.xlsx", False, "unit_tests"), ("test-with-multiple-sheets.xlsx", False, 0)]) +def test_load_dataframes_xlsx_with_sheets(config, absolute_path, test_files, file_name, should_raise_error, sheet_name): + config["format"] = "excel" + config["reader_options"] = {"sheet_name": sheet_name} + client = Client(**config) + f = f"{absolute_path}/{test_files}/{file_name}" + if should_raise_error: + with pytest.raises(AirbyteTracedException): + next(client.load_dataframes(fp=f)) + else: + read_file = next(client.load_dataframes(fp=f)) + expected = read_excel(f, engine="openpyxl", sheet_name=sheet_name) + assert read_file.equals(expected) @pytest.mark.parametrize("file_format, file_path", [("json", "formats/json/demo.json"), ("jsonl", "formats/jsonl/jsonl_nested.jsonl")]) def test_load_nested_json(client, config, absolute_path, test_files, file_format, file_path): From 548b02b04c3b307cb4d9b8c6aa6731d152963ff0 Mon Sep 17 00:00:00 2001 From: Lucas Leadbetter <5595530+lleadbet@users.noreply.github.com> Date: Thu, 20 Nov 2025 11:28:11 -0500 Subject: [PATCH 2/2] add support for multiple sheets --- .../connectors/source-file/source_file/client.py | 9 ++++++--- .../source-file/unit_tests/test_client.py | 13 ++++++++++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index 594d0d31df60..060dca74fe9f 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -523,12 +523,15 @@ def openpyxl_chunk_reader(self, file, **kwargs): # Iterate through sheets # Panda's read_excel allows specifying sheet_name as str, int, list of str/int or None (all sheets) - # For simplicity, we only support str or int for now as we expect users to read one sheet at a time in most cases. for idx, sheetname in enumerate(work_book.sheetnames): - if sheet_name and sheetname is str and sheetname != sheet_name: + # Handle sheet_name filtering + if sheet_name and isinstance(sheet_name, str) and sheetname != sheet_name: continue - elif sheet_name and sheet_name is int and idx != sheet_name: + elif sheet_name and isinstance(sheet_name, int) and idx != sheet_name: continue + elif sheet_name and isinstance(sheet_name, list) and sheetname not in sheet_name and idx not in sheet_name: + continue + work_sheet = work_book[sheetname] data = list(work_sheet.iter_rows(values_only=True)) diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py index f2cbfbc1396b..2e1555b28209 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py @@ -99,7 +99,12 @@ def test_load_dataframes_xlsx(config, absolute_path, test_files, file_name, shou expected = read_excel(f, engine="openpyxl") assert read_file.equals(expected) -@pytest.mark.parametrize("file_name, should_raise_error, sheet_name", [("test-with-multiple-sheets.xlsx", False, "unit_tests"), ("test-with-multiple-sheets.xlsx", False, 0)]) +@pytest.mark.parametrize("file_name, should_raise_error, sheet_name", [ + ("test-with-multiple-sheets.xlsx", False, "unit_tests"), # single sheet by name + ("test-with-multiple-sheets.xlsx", False, 0), # single sheet by index + ("test-with-multiple-sheets.xlsx", False, ["unit_tests"]), # list of sheet names + ("test-with-multiple-sheets.xlsx", False, [0]) # list of sheet indices + ]) def test_load_dataframes_xlsx_with_sheets(config, absolute_path, test_files, file_name, should_raise_error, sheet_name): config["format"] = "excel" config["reader_options"] = {"sheet_name": sheet_name} @@ -111,6 +116,12 @@ def test_load_dataframes_xlsx_with_sheets(config, absolute_path, test_files, fil else: read_file = next(client.load_dataframes(fp=f)) expected = read_excel(f, engine="openpyxl", sheet_name=sheet_name) + print(sheet_name) + if isinstance(sheet_name, list): + print(sheet_name) + print(expected) + expected = expected[sheet_name[0]] + print(expected) assert read_file.equals(expected) @pytest.mark.parametrize("file_format, file_path", [("json", "formats/json/demo.json"), ("jsonl", "formats/jsonl/jsonl_nested.jsonl")])