Skip to content

Commit e8128b9

Browse files
Merge branch 'dev'
2 parents d9fd0df + 2e2bd0a commit e8128b9

File tree

19 files changed

+430
-287
lines changed

19 files changed

+430
-287
lines changed

DocToolsLLM/DocToolsLLM.py

Lines changed: 130 additions & 79 deletions
Large diffs are not rendered by default.

DocToolsLLM/__init__.py

Lines changed: 52 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -8,53 +8,73 @@
88
from rich.markdown import Markdown
99
from rich.console import Console
1010

11-
from .DocToolsLLM import DocToolsLLM_class
11+
from .DocToolsLLM import DocToolsLLM_class as DocToolsLLM
12+
13+
__all__ = [
14+
"DocToolsLLM",
15+
"cli_launcher",
16+
"utils",
17+
]
18+
19+
__VERSION__ = DocToolsLLM.VERSION
1220

1321

1422
def fire_wrapper(
15-
h: bool = False,
16-
help: bool = False,
1723
*args,
1824
**kwargs,
19-
) -> Tuple[List, dict]:
25+
) -> dict:
2026
"used to catch --help arg to display it better then fire does on its own"
21-
assert "h" not in args and "h" not in kwargs
22-
assert "help" not in args and "help" not in kwargs
23-
24-
if (h in ["h", "help", True] or help in ["h", "help", True]): # --help or similar intentions
25-
return [], {"help": True}
2627

27-
if not (h or help or args or kwargs):
28-
return [], {"help": True}
28+
# --help or similar
29+
if ("help" in args and len(args) == 1) or ("help" in kwargs and kwargs["help"]):
30+
print("Showing help")
31+
md = Markdown(DocToolsLLM.__doc__)
32+
console = Console()
33+
console.print(md, style=None)
34+
raise SystemExit()
2935

30-
# parse args as if nothing happened
31-
args = list(args)
32-
if h:
33-
args.insert(0, h)
34-
if help:
35-
args.insert(1, help)
36+
# no args given
37+
if not any([args, kwargs]):
38+
print("Empty arguments, showing help")
39+
md = Markdown(DocToolsLLM.__doc__)
40+
console = Console()
41+
console.print(md, style=None)
42+
raise SystemExit()
3643

3744
# while we're at it, make it so that
3845
# "DocToolsLLM summary" is parsed like "DocToolsLLM --task=summary"
46+
args = list(args)
3947
if args and isinstance(args[0], str):
40-
args[0] = args[0].replace("summary", "summarize")
41-
if args[0] in ["query", "search", "summarize", "summarize_then_query"]:
42-
assert "task" not in kwargs, f"Tried to give task as arg and kwarg?\nargs: {args}\bnkwargs: {kwargs}"
43-
kwargs["task"] = args.pop(0)
44-
return args, kwargs
48+
if args[0].replace("summary", "summarize") in ["query", "search", "summarize", "summarize_then_query"]:
49+
assert "task" not in kwargs or not kwargs["task"], f"Tried to give task as arg and kwarg?\n- args: {args}\n- kwargs: {kwargs}"
50+
kwargs["task"] = args.pop(0).replace("summary", "summarize")
51+
52+
# prepare the parsing of --query
53+
if "query" not in kwargs:
54+
kwargs["query"] = None
55+
if kwargs["query"] in [True, None, False]:
56+
kwargs["query"] = ""
57+
else:
58+
kwargs["query"] = str(kwargs["query"])
59+
60+
# any remaining args is put in --query
61+
if args:
62+
if not kwargs["query"]:
63+
kwargs["query"] = " ".join(map(str, args))
64+
else:
65+
kwargs["query"] += " " + " ".join(map(str, args))
66+
args = []
67+
68+
kwargs["query"] = kwargs["query"].replace("summary", "summarize")
69+
70+
assert not args
71+
return kwargs
4572

4673

4774
def cli_launcher() -> None:
4875
sys_args = sys.argv
4976
if "--completion" in sys_args:
50-
return fire.Fire(DocToolsLLM_class)
77+
return fire.Fire(DocToolsLLM)
5178

52-
args, kwargs = fire.Fire(fire_wrapper)
53-
54-
if "help" in kwargs:
55-
md = Markdown(DocToolsLLM_class.__doc__)
56-
console = Console()
57-
console.print(md, style=None)
58-
raise SystemExit()
59-
else:
60-
instance = DocToolsLLM_class(*args, **kwargs)
79+
kwargs = fire.Fire(fire_wrapper)
80+
instance = DocToolsLLM(**kwargs)

DocToolsLLM/docs/USAGE.md

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@
99

1010
* `--filetype`: str, default `infer`
1111
* the type of input. Depending on the value, different other parameters
12-
are needed. If json_list is used, the line of the input file can contain
12+
are needed. If json_entries is used, each line of the input file can contain
1313
any of those parameters as long as they are as json. You can find
14-
an example of json_list file in `DocToolsLLM/docs/json_list_example.txt`
14+
an example of json_entries file in `DocToolsLLM/docs/json_entries_example.txt`
1515

1616
* Supported values:
1717
* `infer`: will guess the appropriate filetype based on `--path`.
@@ -28,10 +28,10 @@
2828
you must type or paste the string
2929
* `local_audio`: must be set: `--whisper_prompt`, `--whisper_lang`. The model used will be `whisper-1`
3030

31-
* `json_list`: `--path` is path to a txt file that contains a json
31+
* `json_entries`: `--path` is path to a txt file that contains a json
3232
for each line containing at least a filetype and a path key/value
3333
but can contain any parameters described here
34-
* `recursive`: `--path` is the starting path `--pattern` is the globbing
34+
* `recursive_paths`: `--path` is the starting path `--pattern` is the globbing
3535
patterns to append `--exclude` and `--include` can be a list of regex
3636
applying to found paths (include is run first then exclude, if the
3737
pattern is only lowercase it will be case insensitive) `--recursed_filetype`
@@ -47,7 +47,7 @@
4747

4848
* `--modelname`: str, default `"openai/gpt-4o"`
4949
* Keep in mind that given that the default backend used is litellm
50-
the part of modelname before the slash (/) is the server name.
50+
the part of modelname before the slash (/) is the backend name (also called provider).
5151
If the backend is 'testing/' then a fake LLM will be used
5252
for debugging purposes.
5353
If the value is not part of the model list of litellm, will use
@@ -94,7 +94,7 @@
9494
---
9595

9696
* `--query`: str, default `None`
97-
* if str, will be directly used for the first query if task in `["query", "search"]`
97+
* if str, will be directly used for the first query if task in `["query", "search", "summarize_then_query"]`
9898

9999
* `--query_retrievers`: str, default `"default"`
100100
* must be a string that specifies which retriever will be used for
@@ -164,6 +164,7 @@
164164
* `--debug`: bool, default `False`
165165
* if True will enable langchain tracing, increase verbosity,
166166
disable multithreading for summaries and loading files,
167+
crash if an error is encountered when loading a file,
167168
automatically trigger the debugger on exceptions.
168169

169170
* `--dollar_limit`: int, default `5`
@@ -182,7 +183,7 @@
182183
* if True, will remember the messages across a given chat exchange.
183184
Disabled if using a testing model.
184185

185-
* `--no_llm_cache`: bool, default `False`
186+
* `--disable_llm_cache`: bool, default `False`
186187
* WARNING: The cache is temporarily ignored in non openaillms
187188
generations because of an error with langchain's ChatLiteLLM.
188189
Basically if you don't use `--private` and use llm form openai,
@@ -227,7 +228,7 @@
227228

228229
# Loader specific arguments
229230
Those arguments can be set at cli time but can also be used
230-
when using recursive filetype combination to have arguments specific
231+
when using the recursive_paths filetype combination to have arguments specific
231232
to a loader. They apply depending on the value of `--filetype`.
232233
An unexpected argument for a given filetype will result in a crash.
233234

@@ -293,8 +294,8 @@
293294
the audio from the youtube link, and deepgram will be used to turn the audio into text. `--deepgram_kwargs` will be used if set.
294295

295296
* `--include`: str
296-
* Only active if `--filetype` is one of json_list, recursive,
297-
link_file, youtube_playlist.
297+
* Only active if `--filetype` is one of 'json_entries', 'recursive_paths',
298+
'link_file', 'youtube_playlist'.
298299
`--include` can be a list of regex that must be present in the
299300
document PATH (not content!)
300301
`--exclude` can be a list of regex that if present in the PATH
@@ -387,7 +388,7 @@
387388
* `--loading_failure`: str, default `crash`
388389
* either `crash` or `warn`. Determines what to do with
389390
exceptions happening when loading a document. This can be set
390-
per document if a recursive filetype is used.
391+
per document if a recursive_paths filetype is used.
391392

392393
# Runtime flags
393394

DocToolsLLM/docs/json_list_example.txt renamed to DocToolsLLM/docs/json_entries_example.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77

88
# this will fetch all the pdf recursively inside a dir according to some parameters
9-
{"path": "../some/path/to/parent/", "filetype": "recursive", "recursed_filetype": "pdf", "pattern": "*pdf", "exclude": ["regex_to_exclude"], "include": ["regex_that_need_to_be_present"]}
9+
{"path": "../some/path/to/parent/", "filetype": "recursive_paths", "recursed_filetype": "pdf", "pattern": "*pdf", "exclude": ["regex_to_exclude"], "include": ["regex_that_need_to_be_present"]}
1010

1111

1212
# anki deck example

DocToolsLLM/utils/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from DocToolsLLM.utils import *
2+
3+
__all__ = [
4+
'batch_file_loader',
5+
'loaders',
6+
'misc',
7+
'prompts',
8+
]

0 commit comments

Comments
 (0)