From ee62dc48c47995695298fddca021fb1bdb2fb133 Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Tue, 15 Oct 2024 13:50:30 +0200 Subject: [PATCH 1/7] Add password with PDF files --- unstructured_inference/inference/layout.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 57fc742a..6f4bc5e5 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -51,6 +51,7 @@ def from_file( filename: str, fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, pdf_image_dpi: int = 200, + password:Optional[str] = None, **kwargs, ) -> DocumentLayout: """Creates a DocumentLayout from a pdf file.""" @@ -62,6 +63,7 @@ def from_file( pdf_image_dpi, output_folder=temp_dir, path_only=True, + password=password, ) image_paths = cast(List[str], _image_paths) number_of_pages = len(image_paths) @@ -89,6 +91,7 @@ def from_image_file( detection_model: Optional[UnstructuredObjectDetectionModel] = None, element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, fixed_layout: Optional[List[TextRegion]] = None, + password:Optional[str] = None, **kwargs, ) -> DocumentLayout: """Creates a DocumentLayout from an image file.""" @@ -115,6 +118,7 @@ def from_image_file( detection_model=detection_model, element_extraction_model=element_extraction_model, fixed_layout=fixed_layout, + password=password, **kwargs, ) pages.append(page) @@ -133,6 +137,7 @@ def __init__( document_filename: Optional[Union[str, PurePath]] = None, detection_model: Optional[UnstructuredObjectDetectionModel] = None, element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, + password:Optional[str] = None, ): if detection_model is not None and element_extraction_model is not None: raise ValueError("Only one of detection_model and extraction_model should be passed.") @@ -148,6 +153,7 @@ def __init__( self.element_extraction_model = element_extraction_model self.elements: Collection[LayoutElement] = [] self.elements_array: LayoutElements | None = None + self.password = password # NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has # locations now and if we need to support LayoutElements without bounding boxes we can make # the bbox property optional @@ -291,6 +297,7 @@ def from_image( detection_model: Optional[UnstructuredObjectDetectionModel] = None, element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, fixed_layout: Optional[List[TextRegion]] = None, + password:Optional[str] = None, ): """Creates a PageLayout from an already-loaded PIL Image.""" @@ -299,6 +306,7 @@ def from_image( image=image, detection_model=detection_model, element_extraction_model=element_extraction_model, + password=password, ) # FIXME (yao): refactor the other methods so they all return elements like the third route if page.element_extraction_model is not None: @@ -325,6 +333,7 @@ def from_image( def process_data_with_model( data: BinaryIO, model_name: Optional[str], + password: Optional[str] = None, **kwargs: Any, ) -> DocumentLayout: """Process PDF as file-like object `data` into a `DocumentLayout`. @@ -339,6 +348,7 @@ def process_data_with_model( layout = process_file_with_model( file_path, model_name, + password=password, **kwargs, ) @@ -351,6 +361,7 @@ def process_file_with_model( is_image: bool = False, fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, pdf_image_dpi: int = 200, + password: Optional[str] = None, **kwargs: Any, ) -> DocumentLayout: """Processes pdf file with name filename into a DocumentLayout by using a model identified by @@ -370,6 +381,7 @@ def process_file_with_model( filename, detection_model=detection_model, element_extraction_model=element_extraction_model, + password=password, **kwargs, ) if is_image @@ -379,6 +391,7 @@ def process_file_with_model( element_extraction_model=element_extraction_model, fixed_layouts=fixed_layouts, pdf_image_dpi=pdf_image_dpi, + password=password, **kwargs, ) ) @@ -390,6 +403,7 @@ def convert_pdf_to_image( dpi: int = 200, output_folder: Optional[Union[str, PurePath]] = None, path_only: bool = False, + password: Optional[str] = None, ) -> Union[List[Image.Image], List[str]]: """Get the image renderings of the pdf pages using pdf2image""" @@ -402,12 +416,14 @@ def convert_pdf_to_image( dpi=dpi, output_folder=output_folder, paths_only=path_only, + userpw=password, ) else: images = pdf2image.convert_from_path( filename, dpi=dpi, paths_only=path_only, + userpw=password, ) return images From ed43d82eaf752dcf5cc05aaa320ef5430ea12d74 Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Tue, 29 Oct 2024 13:36:54 +0100 Subject: [PATCH 2/7] Add TU --- sample-docs/password.pdf | Bin 0 -> 14177 bytes .../inference/test_layout.py | 19 ++++++++++++++++++ unstructured_inference/inference/layout.py | 4 ---- 3 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 sample-docs/password.pdf diff --git a/sample-docs/password.pdf b/sample-docs/password.pdf new file mode 100644 index 0000000000000000000000000000000000000000..21bd55d500d19531d7ea92e137d223e3797d3642 GIT binary patch literal 14177 zcmeHu2{hH~_qQR@(I7)+9YlsW^Kp*(m?HC#F>!E0hQm2WW-3A{$vmWxDPzdcfJ~W_ z%u1OGWlX4$_j}a6x7+>w-}nFCcipw#^b77jkiH?FceI*I}4JRhv*R89Vt#=1RybjXgIr42qcJxE1p77BiIui2#Sgz zGKECIyMbOlaaZctJeLXY=UVkyrt`j2Mc0{gift@tdh33xSo5kAY+Z`h>}zN6;)TTl zp(ja^*y|6D+Ehf|3DgM?On(+Bf-3P1wDt6U=1P(h=pdHf-cCX@+D!Eo%A19E#3UE{ z;_N`R-Yu?M4)=4OWL!I_Yj6725ylkT$$oKa?9}eOaMnG+s`vAJyEjTcb?dIYFjLoA zJCO_`xI6p^5ExPS1fp(*I)I^|)VDh_nZIlp^IvTEM?zp=7!-j(|FQjTH`D9q%|<%f z%ec%rRYV5#T!uIZ_?Io+CAt*l)=ve_mpTPP_NNarB^i3($gfuZke&B9;=oD$XiePt zh7Xuh9a`gg*K-%R+{BZz#}Ca%G+|Y`-LFe=Mr`aAoW2~;9UWR6wV0rHBu!G0hi8rF zk%|H?&6D(w*kn-z3t@(F=eU(&mv088T$I`9AC3s+^(sDiFj`O{E*_ji@U_XFQIL>(RGJiMyo-KT;XrOXorV5H#~{VA`)|H`kb@<)NgM+ZrF> zO;@i^TuFUfM_VP}^v?U)QS+*I=T%ZaDtAe*_&;}R*7;a+!!r~{<66sj!%=Vk@cDuM z8WVMrmfj*=tGgj;%9dQQRARjz_D$KiTyMnghw$Q}O1 za##b2C^y=ERuv=SAb!A8JjTvCExT%X`wZuj&P*=9h5Pms8K;)Jb+LCt@tBO>SI@3b zd^{VtIaiZ-s(aP$$R|uWyGi~^PtJARSz>joVRcsQ${NM2&nI3N3{D+TFSjnY4vm)ZJbms?D73C|=$+#kJIOp?&Y{E&?4yzFSgMX_SlX z42o?2EZn2ay7lai^x<-rP!>Vnvs(GWWBKvzW|qmsNOmiej@K*Z2P9UH9r$EJ5@}qD zDB^l63U8xKw$M-J?h|fD&1vuD*9u9wq`o_HlcV`U$6-n5&Y4m5o_vO~1|E0|$4R>5 z>w_fMvH6LPwO7gknJsc)rMysb#R0XVRM;WtOXl83?>D8&?jW6+pZLcgZKv70w7haR zBwEqN(1+LJ37^NdR9YCg`F+%$nJgz!xxhpp&{suOwL<0*uGtHo3{sQ!YUQ6iwq1^Z z3Y8_Mx)hPSdfwhaSn6NV>h5cn!!VzKRh#XLu-U*I=X+n8G-Z2pG1O>n z31JkYbN?LFkbTd3RpF_6fsMXP_tY4l$~r+MWql6u3ri;4^DvoK0Q)?$l?!31@T%AH znH5J)>qPA-j^3!OHu0AcMC1i%-J79Dn50n1*vPx; zRKR;g0vR4lROH>gou!xfT2u2u!^OS*eJ|p|jm^j9Pvf>68glueSI(i<*b@AEq1~QZaj>@IJKSSm5P8fr}w`%sxCD zq51kLb)FArJbcz*{d)ZQXKi0?9BXYwt!>5*+ zOjo0uQclUn#YBC0cGE+_ipUkHu@KP_I+PB&HxVn)dDLc;r)8WaeC(ny--Gh5sMwy% zWBNA6@-oc#X)=zQ1x$YU`m7}PUGbpGQ^I2zML!)Tzkuh(D@AQA&XEhJx(>^&Ej<=j zwiD;y{oZ_jXdYfNy1UVq#lSajk`0^qa&U#oi&ouxASWUTeHcBuw@OTGUx!p$m1i=8 z!K3HCI(kx(Tvnna^8rhSq667p^ttqn(0yUbVHpvzkscq;=45mo;S^o|7PZ|!$wD(H zm3-?@2Zefj|aKl)tHE0Vny( zOl>E&HPEeg1c)W<{dO|?TP6K`DZ{JbXVTqUI^e0klF82~zATczuhW=ht>npEHcw9h z+X;NTNx+4H;V|o&79(}s1OYFv%p{`RB5YgqBJRp_KkjDXL@k@5(st;@%S6Fo&hY@P zJ^1R9MTDWNWxC6~-B`D|jkx>Mj~-`PifON0{d7TT|KKO)BL}aojSa1#I_?GUy?7sG zqg2hW|Uvg{?vn>Q&+#IB1kdRG1_%)>YYkM_4 zPiKJaWSUj*YJym?x!+R4GwTb;^u zd$#nO);t!2?AJU5a?ZVCr+;|7r_jPpVov^>;hk?THm|Rl#jD#de$?I8{H*skyFhq2E*DTr9C{O`y05ul%%~^+Rb_La8-gj@3NR958}D-F|&drb#0eHy3#q< zJlbxr2>QH6S>4AB+RL{K1#Q2qVXlx9jQLe^XA9IN7%*Y^V3JhO8p7ep+=*L87qroL}-OQ3)vZBlAmZzj=ru& zd+euNTR!=wA!F2{<8J@7euCTs4AX4~Cii%@2YNN#+Ioi5CZ2s=byX<~M&}&9dNv+$ z++1O$#oxrv3{_|yx@3a+vKn=+@>EQ#{oyKj*kmq5SM%M z{-j5OW>70j^FMrq69!tUzP2z(|nD;^#iN*3ChUXcwBq zUV2lGsiZ(mU6@VDs+nV1Rg+&tg^3ymhxeuMDFSh==Rlh=% z?!Y{!J8?i6In0p1g1YDzJg(-#a4}aeH(oZasJ}Lj{iBFDlWo6~#}NM&H;XsDTtna$ zCei@U@@wz}Z6^|q>OXkFJ!{U~DSaYmmE*=j&5iTKw3({b^WibpbCi(*$XZ2tZ z+G0TPx};1PXd<&t%p*S(oIaV=k<8`eb=~BeBFk;==v$(6q$QUwmro##D`*RpQTKu4 z(@vfegJv3}%K0TZKO5o$@?gh{k3KUcmAYwy3 z9S)zv6w>%K=48f_X>|?+lpKVuYNo}#;h59D7=AqbJ&jTC(1W)pErT;0hv-Kj!aOk_ zgEVQ5B~NL*oaLO$9KhzEE>NW3Q?$2uiuo~@cVnV zMI6b{)Sj}i`cO`)=&)2=R^|IVx&galLe%vy*Vn>d*I6p&2fuixANYBC;iLvy$3Y{t z5rlWZ4Ks9Ki*7rZSskGJ!JLV!NEo@o0Q_T2ROjH{5` zXwWXlk|cEW?%DhCf>Xn1RFLDQ3!Fl17g1=#ToA%tg*(jNHK>$k|WjtIMmBOxYSBT@p>b0q7 ztgK!I+bdcrUrKJexo|2R%^P*w-a0a;^SN8TLB9FM$_t4YqB%H1JFy(LI%@ZA_uN8Bus%KW zsc&C2zZn}l?Gh?&&1_?1xfDI>HE1c?j!<97qGLP%g00vokxRb+@GV?(sGd&bxC4pE ztkF?vsoQ{S&HOZ#Ka(4cIRWZlnY zrS|Ld(WpmCP0xgQ@OlhcQF4R#OiO!Kw%wY9m#=MM>zfo-0*37J?bOf#3Qbn31wG)W zSHus|^azbG#@xG`TJxYWw}*tZAJ!}((S{b zT5ssRxUo&UGFir&>^{=1+^P%lxI!JFt3@AroSe6gA3BRW%aKn1z(m<=Dts=}ZZ>Z0 zSkv8D{*O3&)?AyHPx{-|GcapX`G^;WXM8fY=T$Q&SqxPCebdHIESW!DJn0gZ244+& z;2D@xBv`nbTGV}1%{GYy6LanGCdl)9V>N5aqoiYH)Px64z7I4yi0F;(6N^b=dv+h)W0JhU+Sc8@hkoDF zLNs2-7-Jkyn<_r-QsZ^nmEG(S&)w&?U3dNXc4NS17PlFRIZk&{_P$Skf1e~o=;2E2 zVr`f`j);9@5bJwN;en*c@w$n|r<3iMj|Q<>Ns9SRyeJ#bZclXg&)Yj9;!tSGCDB`URFsO3m)xj5w@$p;(z5rQYs{J z&_R=LZ=;!y!dnf?YT3*!+Ogc18l&g6XA;YgO?N*2C_F{7UEjvCyPwa>oA{g*_b6-^ zm+`_&rL*twx|Mf37ncub!p!XlWjAiPmbiljBx%V@RgibSUNMoxfWaQZW0qah;>Dvf z-)7wWMc4H0ltPsJVLY=ulX)%27d2jS9jpY7=_U=`-2ri6lwYEWy5SM>4Su%Hga&UOE>?ncJA)vVj zGwf|3?QHubZ?Wl2~-nHo3zDCB13~tD|1H2wKWe?NOKEUmp z>Golbj^dv&yv}A7HM?4zhy0KYPr$k<$*YCmEtuNwKoAD1RC; zQrnN=xt)9R(%LZ8V%7?gm~Y{qD1-K3c(1+S&4OcD?M>6RXm_NQj`0%dGC8B z*xHd#jeK5Xm)j1G!*~x&)r89nuk3eJGNt{J+TuFIYut(rgfqIIEJt+V;Hj!{ztGqVGKs+L(;+0v*7rjpSDe&Sq~ix%e{gxok`+$USYa)`#G` zuq)oF%Xr|u#OhM}Le6xA^P2$|!Kc@ngI?Yos`!?D>gc>kp$n&PVtSBTH8CkJI2$6l5g?HJt*_`U{`ZsI6PX}U4pGk zlsesgrt6!CD4r7kc-wdQytH3yoxWtcZ#4w7s29hv^<3-f;~%41 zzBF<9lu@ z_>#Laa$Lph6@RXndwM>PXYJ5}=9+9_NNOOLsXbG?21knrA=%;^t+Z(sX{A*^)2s2V zQEt-s_5NhTS-E}4zLS!M*^D-QY40D)kRf3t*TXFWiYl(?Nd4u9G7)T3)<@fxigfax zP&{9%-_C5l+3*#`K-@CQ^UVmc-5*`ynt!oU>q@DqK-Hr0vD5>r6(Bq-?rNH@l#)fk zQYGOS=}>-AQX02wz1Hp39OjPEY-{8(t0%#w6)tJY_JJ#)9y|WQWUFz;DWxmZ+7ipJ z6VEQ!R~>O)k+{`|3!&5;V`@LnKAL5mfdSB`0i_@Zm9QqHO0+lVG5(#J+9PvA;fc=A1F!A*uKs1Q%6g2|bp5*L7Ap$H4FxA7m5g;l! z>LaP+Y)2sBDb7T9V*<(f42VjzIFi9gU;t>Us1SXuq)b@nTSK%c zcvok8oV%kd0SpB}aAbReI|Yov!axwJGwMVN4nu<=s(24gg0rI&1y~2XG^P;TOu-mn z3DrEn>=58!Dh*-;0!WSDco7)%k3IZgg5Vg;|CR?@Fmbn-wBV^axkta4V&qU%;`lY! z0j9anVio%!*mg}LY3Xn?=W%Drnf-5gGxi3T3HaN;B|>jpAlf7a!mo^x*xLmq+?uJ&gqm z5+eD|>u6B3({a-+cc~d2SN@A#BOF>Ao?RhR=X1#4K!+bH&8Cjbs+hsj>qhX*U1lxo z>n)9`-%6rt{VMDONQIl{guhfpjy83&?{_*9b~Ey5gQe)zc%e>)sC>QZAsh){t`6I=A*7T-aKJ;*u$DW)0)h#}A$9vlfj#vtD&~+KGFs`&v_Q zU5HNB=T>dGw?2mdp?hGkKeGILr$CIo>?qW>F(G*ocBCqJGJ#sF|GHtENo0zu6P`qE z1Rea3F%0M+GiL{i6WIz1MxoGPxQxus4-#l5I0_1ufx~y^C@AnA_@T-X80^j%fk5p1 zwf^2Ff3z={+TBJVhzaqeyEByu1PHOebk9#bq6Wa2QMdW$_WF&Blb8Q)AL#4vW(`P0 zdt(B{3IdRm5EFtA#rjWe2`EvI8^BNQWX1sRNPrmNNz`UXQw>0j2xOuc$(}$4!@g6O zx&#MjJoOAwqeNj*(!eRgB4nUY8H@}Pj6uSrF&L~28X+TtMZq!F5EHx|8DdO5CL}L= z%6HpjYF<@=3IKk{;GIZ+J}byyqS7T$@D6wi{`X3n>+1Z*r=p-Ts6WU2$`XDp}HQb1Se-rKN7*%PtU~O&&3|=Af%)SlJ}AIar1Bk zynub&T;0jCJ_{Ller3PMg4iia!&;_dA%?TwHo zk{ls0EEWra!Xa?D6rdqR_I0P=eWcvU!c-IAZTwby;DQ2EZQ|{SUK9l(p&y1l9L{** zsUiPl9PjMxwqt(hCB&T!ady}V3NZOYU)BRpqA0@TA%B`veL|>CescPM5yY5-@i2#3OvQZT3#9BBfB%fd0T2)HB^E(?YJwEVl~@2ho*4$fzM|2wOH()?rf zk2CP!TK}`|f04=WT>-Q+(EgC$`=8nx|1+L{H}d~m9sjM!zZLn1GVpI}{X>cU+h+f* z$lp%gKeX`Qiu^-~{o7{$t;pX_-9NPO--`T0iT&GV|E06`4ER3s6=Rt+4%-yu^7 z4h%q@`T(Zu3t(;#16RB~!3_W*Au6s!dlxD)3eg6D%|BtJzoLs$rmL1b}tJQGl`E z;LaaXz`yz{)rO%@h#EsZD9(A z{|1i3;24Mz7y;22Q;w8*yeySR9{d=3K%V!89~~C|y07(+E@sL-;p96kAe3iBn=95~GW1zV*=j@`wXH4uGC;%ukW{kO*gh-hx9x(BB_08ihijz-PceWHJaO z&>GYS?Eae!3WWiC_(cXs0;Iy9WB@&f`c)4GMWucAArU~flGu49NF*WBt+fap5*HR%z?dAZ!PMv0mG;BI^1Y Gp#KG?mYu@@ literal 0 HcmV?d00001 diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index d7fc278c..6f844433 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -302,6 +302,25 @@ def mock_get_elements(self, *args, **kwargs): assert page.image is None +@pytest.mark.slow() +def test_from_file_with_password(monkeypatch, mock_final_layout): + + doc = layout.DocumentLayout.from_file( + "sample-docs/password.pdf", + password="password") + assert doc + + monkeypatch.setattr(layout, "get_model", + lambda x: MockLayoutModel(mock_final_layout)) + with patch( + "unstructured_inference.inference.layout.UnstructuredObjectDetectionModel", + MockLayoutModel, + ), open("sample-docs/password.pdf",mode="rb") as fp: + doc = layout.process_data_with_model(fp, model_name="fake", password="password") + assert doc + + + def test_from_image_file_raises_with_empty_fn(): with pytest.raises(FileNotFoundError): layout.DocumentLayout.from_image_file("") diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 6f4bc5e5..473dda07 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -91,7 +91,6 @@ def from_image_file( detection_model: Optional[UnstructuredObjectDetectionModel] = None, element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, fixed_layout: Optional[List[TextRegion]] = None, - password:Optional[str] = None, **kwargs, ) -> DocumentLayout: """Creates a DocumentLayout from an image file.""" @@ -118,7 +117,6 @@ def from_image_file( detection_model=detection_model, element_extraction_model=element_extraction_model, fixed_layout=fixed_layout, - password=password, **kwargs, ) pages.append(page) @@ -297,7 +295,6 @@ def from_image( detection_model: Optional[UnstructuredObjectDetectionModel] = None, element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, fixed_layout: Optional[List[TextRegion]] = None, - password:Optional[str] = None, ): """Creates a PageLayout from an already-loaded PIL Image.""" @@ -306,7 +303,6 @@ def from_image( image=image, detection_model=detection_model, element_extraction_model=element_extraction_model, - password=password, ) # FIXME (yao): refactor the other methods so they all return elements like the third route if page.element_extraction_model is not None: From ce13fa1f99c49df205821deada17b59797a8312f Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Tue, 10 Dec 2024 09:00:35 +0100 Subject: [PATCH 3/7] merge with main --- test_unstructured_inference/inference/test_layout.py | 1 + unstructured_inference/inference/layout.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 6f844433..2830e46f 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -563,6 +563,7 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m detection_model=detection_model, element_extraction_model=element_extraction_model, fixed_layouts=None, + password=None, pdf_image_dpi=200, ) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 473dda07..cd6ef158 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -377,7 +377,6 @@ def process_file_with_model( filename, detection_model=detection_model, element_extraction_model=element_extraction_model, - password=password, **kwargs, ) if is_image From 9e32dd6a436e7b6b39645048ac0f80b0037153fc Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Sat, 18 Jan 2025 16:23:12 -0800 Subject: [PATCH 4/7] make tidy --- test_unstructured_inference/inference/test_layout.py | 10 +++------- unstructured_inference/inference/layout.py | 4 ++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 2830e46f..be0e8769 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -305,22 +305,18 @@ def mock_get_elements(self, *args, **kwargs): @pytest.mark.slow() def test_from_file_with_password(monkeypatch, mock_final_layout): - doc = layout.DocumentLayout.from_file( - "sample-docs/password.pdf", - password="password") + doc = layout.DocumentLayout.from_file("sample-docs/password.pdf", password="password") assert doc - monkeypatch.setattr(layout, "get_model", - lambda x: MockLayoutModel(mock_final_layout)) + monkeypatch.setattr(layout, "get_model", lambda x: MockLayoutModel(mock_final_layout)) with patch( "unstructured_inference.inference.layout.UnstructuredObjectDetectionModel", MockLayoutModel, - ), open("sample-docs/password.pdf",mode="rb") as fp: + ), open("sample-docs/password.pdf", mode="rb") as fp: doc = layout.process_data_with_model(fp, model_name="fake", password="password") assert doc - def test_from_image_file_raises_with_empty_fn(): with pytest.raises(FileNotFoundError): layout.DocumentLayout.from_image_file("") diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index cd6ef158..7c910298 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -51,7 +51,7 @@ def from_file( filename: str, fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, pdf_image_dpi: int = 200, - password:Optional[str] = None, + password: Optional[str] = None, **kwargs, ) -> DocumentLayout: """Creates a DocumentLayout from a pdf file.""" @@ -135,7 +135,7 @@ def __init__( document_filename: Optional[Union[str, PurePath]] = None, detection_model: Optional[UnstructuredObjectDetectionModel] = None, element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, - password:Optional[str] = None, + password: Optional[str] = None, ): if detection_model is not None and element_extraction_model is not None: raise ValueError("Only one of detection_model and extraction_model should be passed.") From 5750785ef4251be7a2fa08faf364b21e02955163 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Sat, 18 Jan 2025 16:25:45 -0800 Subject: [PATCH 5/7] update changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8595d70a..54a3b7d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.8.4-dev0 + +* feat: **Use password** to load PDF with all modes + ## 0.8.3 * fix: removed `layoutelement.from_lp_textblock()` and related tests as it's not used From ceaecc0977ff4ac95514517bf6dc559cb8bb3f62 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Sat, 18 Jan 2025 16:34:16 -0800 Subject: [PATCH 6/7] ignore type --- unstructured_inference/inference/layout.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 7c910298..cee64569 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -411,14 +411,14 @@ def convert_pdf_to_image( dpi=dpi, output_folder=output_folder, paths_only=path_only, - userpw=password, + userpw=password, # type: ignore ) else: images = pdf2image.convert_from_path( filename, dpi=dpi, paths_only=path_only, - userpw=password, + userpw=password, # type: ignore ) return images From 116184f1438a453c27738df32d3a8382953dba1c Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Sat, 18 Jan 2025 16:39:49 -0800 Subject: [PATCH 7/7] update changelog and version --- unstructured_inference/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 54090120..d9ead59a 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.8.3" # pragma: no cover +__version__ = "0.8.4-dev0" # pragma: no cover