Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ You should install Python 3.11 first.

```bash
curl -sSL https://install.python-poetry.org | python3 -
poetry self add poetry-plugin-shell
poetry self add poetry-plugin-shell
```

* install dependencies
Expand Down Expand Up @@ -50,7 +50,7 @@ poetry shell
```

```
PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python celery -A config.celery worker -l INFO --pool gevent
PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python celery -A config.celery worker -l INFO --pool threads
```

* run the frontend
Expand Down
7 changes: 3 additions & 4 deletions aperag/auth/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from aperag.auth import tv
from aperag.utils.constant import KEY_USER_ID, KEY_WEBSOCKET_PROTOCOL
from aperag.db.models import ApiKeyToken, ApiKeyStatus
from ninja.compatibility.request import get_headers
from django.core.cache import cache
from asgiref.sync import sync_to_async

Expand Down Expand Up @@ -59,7 +58,7 @@ async def get_user_from_api_key(key):
return None
if api_key.status == ApiKeyStatus.DELETED:
return None

cache.set(cache_key, api_key.user)
return api_key.user

Expand All @@ -77,7 +76,7 @@ class GlobalHTTPAuth(HttpAuthBase):
openapi_scheme: str = "bearer"
header: str = "Authorization"
async def __call__(self, request: HttpRequest) -> Optional[Any]:
headers = get_headers(request)
headers = request.headers
auth_value = headers.get(self.header)
if not auth_value:
return None
Expand All @@ -89,7 +88,7 @@ async def __call__(self, request: HttpRequest) -> Optional[Any]:
return None
token = " ".join(parts[1:])
return await self.authenticate(request, token, parts[0].lower())

async def authenticate(self, request, token, scheme):
if scheme == self.openapi_scheme:
request.META[KEY_USER_ID] = get_user_from_token(token)
Expand Down
12 changes: 6 additions & 6 deletions aperag/db/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,12 @@


class PagedQuery(BaseModel):
page_number: Optional[int]
page_size: Optional[int]
match_key: Optional[str]
match_value: Optional[str]
order_by: Optional[str]
order_desc: Optional[bool]
page_number: Optional[int] = None
page_size: Optional[int] = None
match_key: Optional[str] = None
match_value: Optional[str] = None
order_by: Optional[str] = None
order_desc: Optional[bool] = None


class PagedResult(BaseModel):
Expand Down
30 changes: 15 additions & 15 deletions aperag/pipeline/base_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,19 @@

class Message(BaseModel):
id: str
query: Optional[str]
timestamp: Optional[int]
response: Optional[str]
urls: Optional[List[str]]
references: Optional[List[Dict]]
collection_id: Optional[str]
embedding_model: Optional[str]
embedding_size: Optional[int]
embedding_score_threshold: Optional[float]
embedding_topk: Optional[int]
llm_model: Optional[str]
llm_prompt_template: Optional[str]
llm_context_window: Optional[int]
query: Optional[str] = None
timestamp: Optional[int] = None
response: Optional[str] = None
urls: Optional[List[str]] = None
references: Optional[List[Dict]] = None
collection_id: Optional[str] = None
embedding_model: Optional[str] = None
embedding_size: Optional[int] = None
embedding_score_threshold: Optional[float] = None
embedding_topk: Optional[int] = None
llm_model: Optional[str] = None
llm_prompt_template: Optional[str] = None
llm_context_window: Optional[int] = None


KUBE_CHAT_DOC_QA_REFERENCES = "|KUBE_CHAT_DOC_QA_REFERENCES|"
Expand Down Expand Up @@ -124,7 +124,7 @@ async def generate_related_question(self, related_question_prompt):
if question:
related_questions.append(question)
else:
related_questions = []
related_questions = []
if content=='':
return related_questions
questions = re.sub(r'\n+', '\n', content).split('\n')
Expand All @@ -137,7 +137,7 @@ async def generate_related_question(self, related_question_prompt):
question = match.group(1)
related_questions.append(question)
return related_questions

@staticmethod
async def new_human_message(message, message_id):
return Message(
Expand Down
10 changes: 5 additions & 5 deletions aperag/readers/base_readers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from typing import Dict, Type

from llama_index.readers.base import BaseReader
from llama_index.readers.file.docs_reader import DocxReader, PDFReader
from llama_index.readers.file.ipynb_reader import IPYNBReader
from llama_index.readers.file.mbox_reader import MboxReader
from llama_index.readers.file.tabular_reader import PandasCSVReader
from llama_index.core.readers.base import BaseReader
from llama_index.readers.file.docs.base import DocxReader, PDFReader
from llama_index.readers.file.ipynb import IPYNBReader
from llama_index.readers.file.mbox import MboxReader
from llama_index.readers.file.tabular import PandasCSVReader

from aperag.readers.compose_audio_reader import ComposeAudioReader
from aperag.readers.compose_image_reader import ComposeImageReader
Expand Down
4 changes: 2 additions & 2 deletions aperag/readers/compose_audio_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from typing import Dict, List, Optional

import requests
from llama_index.readers.base import BaseReader
from llama_index.schema import Document
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

import config.settings as settings

Expand Down
4 changes: 2 additions & 2 deletions aperag/readers/compose_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

import PIL.Image
import requests
from llama_index.readers.base import BaseReader
from llama_index.schema import Document, ImageDocument
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document, ImageDocument
from PIL import Image

import config.settings as settings
Expand Down
4 changes: 2 additions & 2 deletions aperag/readers/compressed_file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from pathlib import Path
from typing import Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.schema import Document
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

logger = logging.getLogger(__name__)
class CompressedFileReader(BaseReader):
Expand Down
6 changes: 3 additions & 3 deletions aperag/readers/doc_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
from pathlib import Path
from typing import Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.file.docs_reader import DocxReader
from llama_index.schema import Document
from llama_index.core.readers.base import BaseReader
from llama_index.readers.file.docs import DocxReader
from llama_index.core.schema import Document

logger = logging.getLogger(__name__)

Expand Down
4 changes: 2 additions & 2 deletions aperag/readers/docx_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from docx.oxml import CT_P, CT_Tbl
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph
from llama_index.readers.base import BaseReader
from llama_index.schema import Document
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

from aperag.utils.utils import Stacks

Expand Down
4 changes: 2 additions & 2 deletions aperag/readers/epub_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from pathlib import Path
from typing import Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.schema import Document
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document


class EpubReader(BaseReader):
Expand Down
4 changes: 2 additions & 2 deletions aperag/readers/excel_reader.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from pathlib import Path
from typing import Dict, List, Optional

from llama_index import Document
from llama_index.readers.base import BaseReader
from llama_index.core.schema import Document
from llama_index.core.readers.base import BaseReader


class ExcelReader(BaseReader):
Expand Down
4 changes: 2 additions & 2 deletions aperag/readers/html_reader.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from pathlib import Path
from typing import Dict, List, Optional

from llama_index import Document
from llama_index.readers.base import BaseReader
from llama_index.core.schema import Document
from llama_index.core.readers.base import BaseReader


class HtmlReader(BaseReader):
Expand Down
23 changes: 9 additions & 14 deletions aperag/readers/local_path_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,8 @@
from typing import Any, Dict, List, Optional, Tuple

from langchain.embeddings.base import Embeddings
from llama_index.data_structs.data_structs import BaseNode
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.node_parser import NodeParser, SimpleNodeParser
from llama_index.vector_stores.types import NodeWithEmbedding
from llama_index.core.data_structs.data_structs import BaseNode
from llama_index.core.node_parser import NodeParser, TokenTextSplitter

from aperag.db.models import ProtectAction
from aperag.readers.base_embedding import DocumentBaseEmbedding
Expand Down Expand Up @@ -45,12 +43,10 @@ def metadata_mapping_func(path: str) -> Dict[str, Any]:
self.reader = InteractiveSimpleDirectoryReader(**kwargs)
self.filter = SensitiveFilterClassify()
self.node_parser = node_parser or \
SimpleNodeParser(
text_splitter=TokenTextSplitter(
chunk_size=kwargs.get('chunk_size', 1024),
chunk_overlap=kwargs.get('chunk_overlap', 20),
tokenizer=get_default_tokenizer(),
)
TokenTextSplitter(
chunk_size=kwargs.get('chunk_size', 1024),
chunk_overlap=kwargs.get('chunk_overlap', 20),
tokenizer=get_default_tokenizer(),
)

def load_data(self, **kwargs) -> Tuple[List[str], str, List]:
Expand All @@ -61,15 +57,14 @@ def load_data(self, **kwargs) -> Tuple[List[str], str, List]:
return [], "", []

nodes: List[BaseNode] = []
nodes_with_embedding: List[NodeWithEmbedding] = []

texts = []
content = ""
sensitive_info = []

for doc in docs:
content += doc.text
doc.text = doc.text.strip()
doc.set_content(doc.text.strip())

# ignore page less than 30 characters
text_size_threshold = 30
Expand Down Expand Up @@ -142,11 +137,11 @@ def load_data(self, **kwargs) -> Tuple[List[str], str, List]:
vectors = self.embedding.embed_documents(texts)

for i in range(len(vectors)):
nodes_with_embedding.append(NodeWithEmbedding(node=nodes[i], embedding=vectors[i]))
nodes[i].embedding = vectors[i]

print(f"processed file: {file_name} ")

return self.connector.store.add(nodes_with_embedding), content, sensitive_info
return self.connector.store.add(nodes), content, sensitive_info

def delete(self, **kwargs) -> bool:
return self.connector.delete(**kwargs)
6 changes: 3 additions & 3 deletions aperag/readers/local_path_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import os
from typing import Callable, Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.file.base import SimpleDirectoryReader
from llama_index.readers.schema.base import Document
from llama_index.core.readers.base import BaseReader
from llama_index.core.readers.file.base import SimpleDirectoryReader
from llama_index.core.schema import Document

from aperag.readers.base_readers import DEFAULT_FILE_READER_CLS

Expand Down
4 changes: 2 additions & 2 deletions aperag/readers/markdown_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, cast

from llama_index.readers.base import BaseReader
from llama_index.schema import Document
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from mdsplit import Chapter, split_by_heading
from pydantic import BaseModel

Expand Down
4 changes: 2 additions & 2 deletions aperag/readers/ppt_reader.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from pathlib import Path
from typing import Dict, List, Optional

from llama_index import Document
from llama_index.readers.base import BaseReader
from llama_index.core.schema import Document
from llama_index.core.readers.base import BaseReader


class PptReader(BaseReader):
Expand Down
4 changes: 2 additions & 2 deletions aperag/readers/pptx_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
from pathlib import Path
from typing import Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.schema import Document
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

# model_id_or_path = "nlpconnect/vit-gpt2-image-captioning"
model_id_or_path = (
Expand Down
8 changes: 4 additions & 4 deletions aperag/readers/qa_embedding.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@

from llama_index import LangchainEmbedding
from llama_index.data_structs import Node
from llama_index.schema import NodeRelationship, RelatedNodeInfo
from llama_index.vector_stores.types import NodeWithEmbedding
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.core.data_structs import Node
from llama_index.core.schema import NodeRelationship, RelatedNodeInfo
from llama_index.core.vector_stores.types import NodeWithEmbedding

from aperag.readers.base_embedding import DocumentBaseEmbedding
from aperag.vectorstore.connector import VectorStoreConnectorAdaptor
Expand Down
Loading
Loading