Skip to content

Commit 5f63f03

Browse files
Merge branch 'dev'
2 parents 15bfcca + d80d1c0 commit 5f63f03

32 files changed

+1749
-1380
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@ author_dir
77
pages
88
.env
99
**/__pycache__
10-
DocToolsLLM.egg-info
10+
*.egg-info
1111
build
1212
.aider*

DocToolsLLM/utils/typechecker.py

Lines changed: 0 additions & 25 deletions
This file was deleted.

README.md

Lines changed: 59 additions & 52 deletions
Large diffs are not rendered by default.

DocToolsLLM/DocToolsLLM.py renamed to WinstonDoc/WinstonDoc.py

Lines changed: 133 additions & 167 deletions
Large diffs are not rendered by default.

DocToolsLLM/__init__.py renamed to WinstonDoc/__init__.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55
import sys
66
import fire
77

8-
from .DocToolsLLM import DocToolsLLM_class as DocToolsLLM
8+
from .WinstonDoc import WinstonDoc
99

1010
__all__ = [
11-
"DocToolsLLM",
11+
"WinstonDoc",
1212
"cli_launcher",
1313
"utils",
1414
]
1515

16-
__VERSION__ = DocToolsLLM.VERSION
16+
__VERSION__ = WinstonDoc.VERSION
1717

1818

1919
def fire_wrapper(
@@ -25,17 +25,17 @@ def fire_wrapper(
2525
# --help but not caught by sys.argv
2626
if "help" in kwargs and kwargs["help"]:
2727
print("Showing help")
28-
DocToolsLLM.md_printer(DocToolsLLM.__doc__)
28+
WinstonDoc.md_printer(WinstonDoc.__doc__)
2929
raise SystemExit()
3030

3131
# no args given
3232
if not any([args, kwargs]):
3333
print("Empty arguments, showing help")
34-
DocToolsLLM.md_printer(DocToolsLLM.__doc__)
34+
WinstonDoc.md_printer(WinstonDoc.__doc__)
3535
raise SystemExit()
3636

3737
# while we're at it, make it so that
38-
# "DocToolsLLM summary" is parsed like "DocToolsLLM --task=summary"
38+
# "WinstonDoc summary" is parsed like "WinstonDoc --task=summary"
3939
args = list(args)
4040
if args and isinstance(args[0], str):
4141
if args[0].replace("summary", "summarize") in ["query", "search", "summarize", "summarize_then_query"]:
@@ -67,12 +67,14 @@ def fire_wrapper(
6767

6868
def cli_launcher() -> None:
6969
sys_args = sys.argv
70+
if "--version" in sys_args:
71+
return __VERSION__
7072
if "--help" in sys_args:
7173
print("Showing help")
72-
DocToolsLLM.md_printer(DocToolsLLM.__doc__)
74+
WinstonDoc.md_printer(WinstonDoc.__doc__)
7375
raise SystemExit()
74-
if "--completion" in sys_args:
75-
return fire.Fire(DocToolsLLM)
76+
if "--" in sys_args and "--completion" in sys_args:
77+
return fire.Fire(WinstonDoc)
7678

7779
kwargs = fire.Fire(fire_wrapper)
78-
instance = DocToolsLLM(**kwargs)
80+
instance = WinstonDoc(**kwargs)

DocToolsLLM/__main__.py renamed to WinstonDoc/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Entry point used when DocToolsLLM is imported or called by 'python -m DocToolsLLM'.
2+
Entry point used when WinstonDoc is imported or called by 'python -m WinstonDoc'.
33
Does the same as __init__.py
44
"""
55

DocToolsLLM/docs/USAGE.md renamed to WinstonDoc/docs/USAGE.md

Lines changed: 19 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,19 @@
77
* `summarize`: means the input will be passed through a summarization prompt.
88
* `summarize_then_query`: summarize the text then open the prompt to allow querying directly the source document.
99

10-
* `--filetype`: str, default `infer`
10+
* `--filetype`: str, default `auto`
1111
* the type of input. Depending on the value, different other parameters
1212
are needed. If json_entries is used, the line of the input file can contain
1313
any of those parameters as long as they are as json. You can find
14-
an example of json_entries file in `DocToolsLLM/docs/json_entries_example.txt`
14+
an example of json_entries file in `WinstonDoc/docs/json_entries_example.txt`
1515

1616
* Supported values:
17-
* `infer`: will guess the appropriate filetype based on `--path`.
17+
* `auto`: will guess the appropriate filetype based on `--path`.
1818
Irrelevant for some filetypes, eg if `--filetype`=anki
1919
* `youtube`: `--path` must link to a youtube video
2020
* `youtube_playlist`: `--path` must link to a youtube playlist
2121
* `pdf`: `--path` is path to pdf
22-
* `txt`: `--path` is path to txt
22+
* `text`: `--path` is path to a .txt file
2323
* `url`: `--path` must be a valid http(s) link
2424
* `anki`: must be set: `--anki_profile`. Optional: `--anki_deck`,
2525
`--anki_notetype`, `--anki_template`, `--anki_tag_filter`.
@@ -35,7 +35,7 @@
3535
be downloaded. Possible arguments are `--onlinemedia_url_regex`,
3636
`--onlinemedia_resourcetype_regex`. Then arguments of `local_audio`.
3737

38-
* `json_entries`: `--path` is path to a txt file that contains a json
38+
* `json_entries`: `--path` is path to a text file that contains a json
3939
for each line containing at least a filetype and a path key/value
4040
but can contain any parameters described here
4141
* `recursive_paths`: `--path` is the starting path `--pattern` is the globbing
@@ -118,14 +118,14 @@
118118
if contains `hyde` but modelname contains `testing` then `hyde` will
119119
be removed.
120120

121-
* `--query_eval_modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet:beta"`
121+
* `--query_eval_modelname`: str, default `"openai/gpt4o-mini"`
122122
* Cheaper and quicker model than modelname. Used for intermediate
123123
steps in the RAG, not used in other tasks.
124124
If the value is not part of the model list of litellm, will use
125125
fuzzy matching to find the best match.
126126
None to disable.
127127

128-
* `--query_eval_check_number`: int, default `1`
128+
* `--query_eval_check_number`: int, default `4`
129129
* number of passes to do with the eval llm to check if the document
130130
is indeed relevant to the question. The document will not
131131
be processed if all answers from the eval llm are 0, and will
@@ -137,13 +137,6 @@
137137
* threshold under which a document cannot be considered relevant by
138138
embeddings alone.
139139

140-
* `--query_condense_question`: bool, default `True`
141-
* if True, will not use a special LLM call to reformulate the question
142-
when task is `query`. Otherwise, the query will be reformulated as
143-
a standalone question. Useful when you have multiple questions in
144-
a row.
145-
Disabled if using a testing model.
146-
147140
---
148141

149142
* `--summary_n_recursion`: int, default `1`
@@ -187,15 +180,11 @@
187180
can be used for example to send notification on your phone
188181
using ntfy.sh to get summaries.
189182

190-
* `--memoryless`: bool, default `False`
191-
* if False, will remember the messages across a given chat exchange.
192-
Disabled if using a testing model.
193-
194183
* `--disable_llm_cache`: bool, default `False`
195184
* WARNING: The cache is temporarily ignored in non-OpenAI LLM
196185
generations because of an error with langchain's ChatLiteLLM.
197186
Basically if you don't use `--private` and use an LLM from OpenAI,
198-
DocToolsLLM will use ChatOpenAI with regular caching, otherwise
187+
WinstonDoc will use ChatOpenAI with regular caching, otherwise
199188
we use ChatLiteLLM with LLM caching disabled.
200189
More at https://github.com/langchain-ai/langchain/issues/22389
201190

@@ -243,7 +232,7 @@
243232
to a loader. They apply depending on the value of `--filetype`.
244233
An unexpected argument for a given filetype will result in a crash.
245234

246-
* `--path`: str
235+
* `--path`: str or PosixPath
247236
* Used by most loaders. For example for `--filetype=youtube` the path
248237
must point to a youtube video.
249238

@@ -311,14 +300,13 @@
311300
Either 'youtube', 'whisper' or 'deepgram'.
312301
Default is 'youtube'.
313302
* If 'youtube': will take the youtube transcripts as text content.
314-
* If 'whisper': DocToolsLLM will download
303+
* If 'whisper': WinstonDoc will download
315304
the audio from the youtube link, and whisper will be used to turn the audio into text. whisper_prompt and whisper_lang will be used if set.
316305
* If 'deepgram' will download
317306
the audio from the youtube link, and deepgram will be used to turn the audio into text. `--deepgram_kwargs` will be used if set.
318307

319308
* `--include`: str
320-
* Only active if `--filetype` is one of 'json_entries', 'recursive_paths',
321-
'link_file', 'youtube_playlist'.
309+
* Only active if `--filetype` is 'recursive_paths'
322310
`--include` can be a list of regex that must be present in the
323311
document PATH (not content!)
324312
`--exclude` can be a list of regex that if present in the PATH
@@ -329,10 +317,10 @@
329317

330318
# Other specific arguments
331319

332-
* `--out_file`: str, default `None`
333-
* If doctools must create a summary, if out_file given the summary will
320+
* `--out_file`: str or PosixPath, default `None`
321+
* If WinstonDoc must create a summary and out_file is given, the summary will
334322
be written to this file. Note that the file is not erased and
335-
Doctools will simply append to it.
323+
WinstonDoc will simply append to it.
336324
* If `--summary_n_recursion` is used, additional files will be
337325
created with the name `{out_file}.n.md` with n being the n-1th recursive
338326
summary.
@@ -379,10 +367,10 @@
379367
each document instead of the metadata.
380368
Syntax: `[+-]your_regex`
381369
Example:
382-
* Keep only the document that contain `doctools`
383-
`--filter_content=+.*doctools.*`
384-
* Discard the document that contain `DOCTOOLS`
385-
`--filter_content=-.*DOCTOOLS.*`
370+
* Keep only the documents that contain `winstondoc`
371+
`--filter_content=+.*winstondoc.*`
372+
* Discard the documents that contain `winstondoc`
373+
`--filter_content=-.*winstondoc.*`
386374

387375
* `--embed_instruct`: bool, default `None`
388376
* when loading an embedding model using HuggingFace or
@@ -436,7 +424,7 @@
436424

437425
# Runtime flags
438426

439-
* `DOCTOOLS_TYPECHECKING`
427+
* `WINSTONDOC_TYPECHECKING`
440428
* Setting for runtime type checking. Default value is `warn`. Possible values are listed below:
441429
The typing is checked using [beartype](https://beartype.readthedocs.io/en/latest/) so shouldn't slow down the runtime.
442430
* `disabled`: disable typechecking.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[[this_will_fetch_all_the_pdf_recursively_inside_a_dir_according_to_some_parameters]]
2+
path = '../some/path/to/parent/'
3+
filetype = 'recursive_paths'
4+
recursed_filetype = 'pdf'
5+
pattern = '*pdf'
6+
exclude = ['regex_to_exclude']
7+
include = ['regex_that_need_to_be_present']
8+
9+
[[anki_deck_example]]
10+
filetype = 'anki'
11+
anki_profile = 'name_of_your_anki_profile_for_ankipandas'
12+
anki_deck = 'personnal::paintings'
13+
anki_notetype = 'my_note_type'
14+
anki_template = '''
15+
Question:{question_field}
16+
Answer:{answer_field}'''

DocToolsLLM/utils/__init__.py renamed to WinstonDoc/utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from DocToolsLLM.utils import *
1+
from WinstonDoc.utils import *
22

33
__all__ = [
44
'batch_file_loader',

0 commit comments

Comments
 (0)