Merged
Changes from 3 commits
6 changes: 3 additions & 3 deletions .github/workflows/unit_tests.yml
@@ -13,12 +13,12 @@ jobs:
strategy:
max-parallel: 2
matrix:
python-version: [ 3.7, 3.8, 3.9, '3.10' ]
python-version: [3.9, "3.10", "3.11", "3.12", "3.13"]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Set up python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install System Dependencies
216 changes: 216 additions & 0 deletions .gitignore
@@ -0,0 +1,216 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
# Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
# poetry.lock
# poetry.toml

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
# pdm.lock
# pdm.toml
.pdm-python
.pdm-build/

# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
# pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# Redis
*.rdb
*.aof
*.pid

# RabbitMQ
mnesia/
rabbitmq/
rabbitmq-data/

# ActiveMQ
activemq-data/

# SageMath parsed files
*.sage.py

# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
# .idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/

# Streamlit
.streamlit/secrets.toml
58 changes: 39 additions & 19 deletions padacioso/__init__.py
@@ -1,4 +1,3 @@
import concurrent.futures
from typing import List, Iterator, Optional

import simplematch
@@ -15,7 +14,6 @@

from difflib import SequenceMatcher


def fuzzy_match(x, against):
"""Perform a 'fuzzy' comparison between two strings.
Returns:
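The body of fuzzy_match is collapsed in this diff. Given the SequenceMatcher import above, a minimal sketch of such a comparison (an illustration, not necessarily the verbatim padacioso implementation):

    from difflib import SequenceMatcher

    def fuzzy_match(x, against):
        """Perform a 'fuzzy' comparison between two strings.

        Returns:
            float: similarity ratio in [0.0, 1.0], where 1.0 means identical
        """
        return SequenceMatcher(None, x, against).ratio()

    fuzzy_match("turn on the light", "turn off the light")  # ~0.91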
@@ -38,6 +36,10 @@ def __init__(self, fuzz=False, n_workers=4):
self.excluded_keywords = {}
self.excluded_contexts = {}

# Cache for optimization - pre-built list for fast iteration
self._intent_list = [] # Pre-built list of (intent_name, regexes)
self._cache_dirty = True # Flag to rebuild cache on next query

if "word" not in simplematch.types:
LOG.debug(f"Registering `word` type")
_init_sm_word_type()
@@ -93,6 +95,7 @@ def remove_intent(self, name: str):
self._cased_matchers.pop(rx)
if rx in self._uncased_matchers:
self._uncased_matchers.pop(rx)
self._cache_dirty = True # Mark cache as needing rebuild

def add_entity(self, name: str, lines: List[str]):
"""
@@ -118,6 +121,15 @@ def remove_entity(self, name: str):
if name in self.entity_samples:
del self.entity_samples[name]

def _rebuild_cache(self):
"""
Rebuild cached intent metadata for fast filtering.
Called lazily on the first query after registration, to avoid O(n²) total work during bulk registration.
"""
# Pre-build the intent list to avoid reconstructing it every query
self._intent_list = list(self.intent_samples.items())
self._cache_dirty = False
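A hypothetical usage sketch of the dirty-flag pattern (assumes the IntentContainer class and its add_intent(name, samples) registration call, following the rest of the library):

    from padacioso import IntentContainer

    container = IntentContainer()
    # Bulk registration only flips _cache_dirty; there is no per-add rebuild.
    for i in range(1000):
        container.add_intent(f"intent_{i}", [f"utterance number {i} {{thing}}"])
    # The first query triggers a single _rebuild_cache() pass.
    container.calc_intent("utterance number 1 widget")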

def _filter(self, query: str):
# filter intents based on context/excluded keywords
excluded_intents = []
@@ -205,28 +217,33 @@ def _fuzzy_score(self, query, s, penalty=0.25):
score = (fuzzy_score + base_score) / 2

if entities is not None:
return {"entities": entities or {},
"conf": (fuzzy_score + base_score) / 2}
return {"entities": entities or {}, "conf": score}

def calc_intents(self, query: str) -> Iterator[dict]:
"""
Determine possible intents for a given query
@param query: input to evaluate for an intent match
@return: yields dict intent matches
"""
# filter intents based on context/excluded keywords
# Lazy cache rebuild - only rebuild once after bulk registration
# This avoids O(n²) scaling during registration (rebuild on every add)
if self._cache_dirty:
self._rebuild_cache()

# Filter based on runtime context/keywords (query and session dependent)
excluded_intents = self._filter(query)

# do the work in parallel instead of sequentially
with concurrent.futures.ProcessPoolExecutor(max_workers=self.workers) as executor:
future_to_source = {
executor.submit(self._match, query, intent_name, regexes): intent_name
for intent_name, regexes in self.intent_samples.items() if intent_name not in excluded_intents
}
for future in concurrent.futures.as_completed(future_to_source):
res = future.result()
if res is not None:
yield res
# Sequential processing - process-pool overhead outweighs the actual regex-matching work
for intent_name, regexes in self._intent_list:
if intent_name in excluded_intents:
continue
res = self._match(query, intent_name, regexes)
if res is not None:
yield res
# Early exit optimization: perfect match found
# TODO: validate that there are no duplicate perfect matches, and warn if there are
if res.get("conf", 0) == 1.0:
return
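A quick usage sketch of the early exit (hypothetical intents; the result shape follows the return dicts above):

    from padacioso import IntentContainer

    container = IntentContainer()
    container.add_intent("greeting", ["hello {name}"])
    container.add_intent("weather", ["what is the weather in {city}"])

    # An exact template match scores conf == 1.0, so calc_intents stops
    # after the first perfect hit instead of scanning the remaining intents.
    match = container.calc_intent("hello world")
    # e.g. {"name": "greeting", "entities": {"name": "world"}, "conf": 1.0}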

def calc_intent(self, query: str) -> Optional[dict]:
"""
@@ -260,6 +277,7 @@ def exclude_keywords(self, intent_name, samples):
self.excluded_keywords[intent_name] = samples
else:
self.excluded_keywords[intent_name] += samples
self._cache_dirty = True # Mark cache as needing rebuild

def set_context(self, intent_name, context_name, context_val=None):
if intent_name not in self.available_contexts:
@@ -271,11 +289,12 @@ def exclude_context(self, intent_name, context_name):
self.excluded_contexts[intent_name] = [context_name]
else:
self.excluded_contexts[intent_name].append(context_name)
self._cache_dirty = True # Mark cache as needing rebuild

def unexclude_context(self, intent_name, context_name):
if intent_name in self.excluded_contexts:
self.excluded_contexts[intent_name] = [c for c in self.excluded_contexts[intent_name]
if context_name != c]
self.excluded_contexts[intent_name] = [c for c in self.excluded_contexts[intent_name] if context_name != c]
self._cache_dirty = True # Mark cache as needing rebuild

def unset_context(self, intent_name, context_name):
if intent_name in self.available_contexts:
@@ -287,11 +306,12 @@ def require_context(self, intent_name, context_name):
self.required_contexts[intent_name] = [context_name]
else:
self.required_contexts[intent_name].append(context_name)
self._cache_dirty = True # Mark cache as needing rebuild

def unrequire_context(self, intent_name, context_name):
if intent_name in self.required_contexts:
self.required_contexts[intent_name] = [c for c in self.required_contexts[intent_name]
if context_name != c]
self.required_contexts[intent_name] = [c for c in self.required_contexts[intent_name] if context_name != c]
self._cache_dirty = True # Mark cache as needing rebuild


def _init_sm_word_type():
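The sequential-over-parallel rationale is easy to check with a toy benchmark. A stand-in sketch using stdlib re in place of simplematch (timings vary by machine, but the ordering should not):

    import concurrent.futures
    import re
    import time

    PATTERNS = [re.compile(rf"sample {i} (\w+)") for i in range(200)]
    QUERY = "sample 150 utterance"

    def match_one(i):
        m = PATTERNS[i].match(QUERY)
        return m.groups() if m else None

    if __name__ == "__main__":
        start = time.perf_counter()
        seq = [match_one(i) for i in range(len(PATTERNS))]
        t_seq = time.perf_counter() - start

        start = time.perf_counter()
        with concurrent.futures.ProcessPoolExecutor(max_workers=4) as pool:
            par = list(pool.map(match_one, range(len(PATTERNS))))
        t_par = time.perf_counter() - start

        # Process spawn + pickling overhead dwarfs the regex work itself:
        # the sequential pass takes microseconds, the pool run milliseconds.
        print(f"sequential: {t_seq:.6f}s  process pool: {t_par:.6f}s")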