Skip to content

Commit 6465e90

Browse files
committed
Migrate main branch from MedCAT to cogstack-nlp
2 parents 2957800 + e185218 commit 6465e90

File tree

235 files changed

+403804
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

235 files changed

+403804
-0
lines changed

medcat-v1/.dockerignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
*.DS_Store
2+
.git
3+
.idea
4+
.dat
5+
venv

medcat-v1/.flake8

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
[flake8]
2+
extend-ignore =
3+
E124,
4+
; closing bracket does not match visual indentation
5+
E127,
6+
; continuation line over-indented for visual indent
7+
E128,
8+
; continuation line under-indented for visual indent
9+
E221,
10+
; multiple spaces before operator
11+
E225,
12+
; missing whitespace around operator
13+
E231,
14+
; missing whitespace after ',' and ':'
15+
E252,
16+
; missing whitespace around parameter equal
17+
E261,
18+
; at least two spaces before inline comment
19+
E265,
20+
; block comment should start with '# '
21+
E272,
22+
; multiple spaces before keyword
23+
E303,
24+
; too many blank lines
25+
E501,
26+
; line too long
27+
W291,
28+
; trailing whitespace
29+
W605,
30+
; invalid escape sequence
31+
32+
per-file-ignores = __init__.py:F401
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# For most projects, this workflow file will not need changing; you simply need
2+
# to commit it to your repository.
3+
#
4+
# You may wish to alter this file to override the set of languages analyzed,
5+
# or to provide custom queries or build logic.
6+
#
7+
# ******** NOTE ********
8+
# We have attempted to detect the languages in your repository. Please check
9+
# the `language` matrix defined below to confirm you have the correct set of
10+
# supported CodeQL languages.
11+
#
12+
name: "CodeQL"
13+
14+
on:
15+
push:
16+
branches: [ "main" ]
17+
pull_request:
18+
branches: [ "main" ]
19+
schedule:
20+
- cron: '36 14 * * 0'
21+
22+
jobs:
23+
analyze:
24+
name: Analyze (${{ matrix.language }})
25+
# Runner size impacts CodeQL analysis time. To learn more, please see:
26+
# - https://gh.io/recommended-hardware-resources-for-running-codeql
27+
# - https://gh.io/supported-runners-and-hardware-resources
28+
# - https://gh.io/using-larger-runners (GitHub.com only)
29+
# Consider using larger runners or machines with greater resources for possible analysis time improvements.
30+
runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
31+
timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
32+
permissions:
33+
# required for all workflows
34+
security-events: write
35+
36+
# required to fetch internal or private CodeQL packs
37+
packages: read
38+
39+
# only required for workflows in private repositories
40+
actions: read
41+
contents: read
42+
43+
strategy:
44+
fail-fast: false
45+
matrix:
46+
include:
47+
- language: javascript-typescript
48+
build-mode: none
49+
- language: python
50+
build-mode: none
51+
# CodeQL supports the following values for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
52+
# Use `c-cpp` to analyze code written in C, C++ or both
53+
# Use 'java-kotlin' to analyze code written in Java, Kotlin or both
54+
# Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
55+
# To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
56+
# see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
57+
# If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
58+
# your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
59+
steps:
60+
- name: Checkout repository
61+
uses: actions/checkout@v4
62+
63+
# Initializes the CodeQL tools for scanning.
64+
- name: Initialize CodeQL
65+
uses: github/codeql-action/init@v3
66+
with:
67+
languages: ${{ matrix.language }}
68+
build-mode: ${{ matrix.build-mode }}
69+
# If you wish to specify custom queries, you can do so here or in a config file.
70+
# By default, queries listed here will override any specified in a config file.
71+
# Prefix the list here with "+" to use these queries and those in the config file.
72+
73+
# For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
74+
# queries: security-extended,security-and-quality
75+
76+
# If the analyze step fails for one of the languages you are analyzing with
77+
# "We were unable to automatically build your code", modify the matrix above
78+
# to set the build mode to "manual" for that language. Then modify this step
79+
# to build your code.
80+
# ℹ️ Command-line programs to run using the OS shell.
81+
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
82+
- if: matrix.build-mode == 'manual'
83+
shell: bash
84+
run: |
85+
echo 'If you are using a "manual" build mode for one or more of the' \
86+
'languages you are analyzing, replace this with the commands to build' \
87+
'your code, for example:'
88+
echo ' make bootstrap'
89+
echo ' make release'
90+
exit 1
91+
92+
- name: Perform CodeQL Analysis
93+
uses: github/codeql-action/analyze@v3
94+
with:
95+
category: "/language:${{matrix.language}}"
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
name: build
2+
3+
on:
4+
push:
5+
branches: [ main ]
6+
pull_request:
7+
branches: [ main ]
8+
9+
jobs:
10+
build:
11+
12+
runs-on: ubuntu-latest
13+
strategy:
14+
matrix:
15+
python-version: [ '3.9', '3.10', '3.11', '3.12' ]
16+
max-parallel: 4
17+
18+
steps:
19+
- uses: actions/checkout@v4
20+
- name: Set up Python ${{ matrix.python-version }}
21+
uses: actions/setup-python@v4
22+
with:
23+
python-version: ${{ matrix.python-version }}
24+
- name: Install dependencies
25+
run: |
26+
python -m pip install --upgrade pip
27+
pip install -r requirements-dev.txt
28+
- name: Check types
29+
run: |
30+
python -m mypy --follow-imports=normal medcat
31+
- name: Lint
32+
run: |
33+
flake8 medcat
34+
- name: Pydantic 1 check
35+
# NOTE: the following will look for use of pydantic1-specific .dict() method and .__fields__ attribute
36+
# if there are some (that are not annotated for pydantic1 backwards compatibility) a non-zero exit
37+
# code is returned, which will halt the workflow and print out the offending parts
38+
run: |
39+
grep "\.__fields__" medcat -rI | grep -v "# 4pydantic1 - backwards compatibility" | tee /dev/stderr | test $(wc -l) -eq 0
40+
grep "\.dict(" medcat -rI | grep -v "# 4pydantic1 - backwards compatibility" | tee /dev/stderr | test $(wc -l) -eq 0
41+
- name: Test
42+
run: |
43+
all_files=$(git ls-files | grep '^tests/.*\.py$' | grep -v '/__init__\.py$' | sed 's/\.py$//' | sed 's/\//./g')
44+
num_files=$(echo "$all_files" | wc -l)
45+
midpoint=$((num_files / 2))
46+
first_half_nl=$(echo "$all_files" | head -n $midpoint)
47+
second_half_nl=$(echo "$all_files" | tail -n +$(($midpoint + 1)))
48+
timeout 25m python -m unittest ${first_half_nl[@]}
49+
timeout 25m python -m unittest ${second_half_nl[@]}
50+
- name: Regression
51+
run: source tests/resources/regression/run_regression.sh
52+
- name: Model backwards compatibility
53+
run: source tests/resources/model_compatibility/check_backwards_compatibility.sh
54+
- name: Get the latest release version
55+
id: get_latest_release
56+
uses: actions/github-script@v6
57+
with:
58+
script: |
59+
const latestRelease = await github.rest.repos.getLatestRelease({
60+
owner: context.repo.owner,
61+
repo: context.repo.repo
62+
});
63+
core.setOutput('latest_version', latestRelease.data.tag_name);
64+
- name: Make sure there's no deprecated methods that should be removed.
65+
# only run this for main -> production PR, i.e. just before doing a release.
66+
if: github.event.pull_request.base.ref == 'main' && github.event.pull_request.head.ref == 'production'
67+
env:
68+
VERSION: ${{ steps.get_latest_release.outputs.latest_version }}
69+
run: |
70+
python tests/check_deprecations.py "$VERSION" --next-version --remove-prefix
71+
72+
publish-to-test-pypi:
73+
74+
if: |
75+
github.repository == 'CogStack/MedCAT' &&
76+
github.ref == 'refs/heads/main' &&
77+
github.event_name == 'push' &&
78+
startsWith(github.ref, 'refs/tags') != true
79+
runs-on: ubuntu-24.04
80+
timeout-minutes: 45
81+
concurrency: publish-to-test-pypi
82+
needs: [build]
83+
84+
steps:
85+
- name: Checkout main
86+
uses: actions/checkout@v4
87+
with:
88+
ref: 'main'
89+
fetch-depth: 0
90+
91+
- name: Set up Python 3.9
92+
uses: actions/setup-python@v4
93+
with:
94+
python-version: 3.9
95+
96+
- name: Install pypa/build
97+
run: >-
98+
python -m
99+
pip install
100+
build
101+
--user
102+
103+
- name: Configure the version
104+
run: >-
105+
sed --in-place
106+
"s/node-and-date/no-local-version/g"
107+
setup.py
108+
109+
- name: Build a binary wheel and a source tarball
110+
run: >-
111+
python -m
112+
build
113+
--sdist
114+
--wheel
115+
--outdir dist/
116+
.
117+
118+
- name: Publish dev distribution to Test PyPI
119+
uses: pypa/gh-action-pypi-publish@v1.4.2
120+
with:
121+
password: ${{ secrets.TEST_PYPI_API_TOKEN }}
122+
repository_url: https://test.pypi.org/legacy/
123+
continue-on-error: true
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
name: production
2+
3+
on:
4+
push:
5+
branches: [ production, "v[0-9]+.[0-9]+.post" ]
6+
release:
7+
types: [ published , edited ]
8+
9+
jobs:
10+
build-n-publish-to-pypi:
11+
runs-on: ubuntu-24.04
12+
concurrency: build-n-publish-to-pypi
13+
if: github.repository == 'CogStack/MedCAT'
14+
15+
steps:
16+
- name: Checkout production
17+
uses: actions/checkout@v4
18+
with:
19+
ref: ${{ github.event.release.target_commitish }}
20+
fetch-depth: 0
21+
22+
- name: Set up Python 3.9
23+
uses: actions/setup-python@v4
24+
with:
25+
python-version: 3.9
26+
27+
- name: Run UATs
28+
run: |
29+
python -m pip install --upgrade pip
30+
pip install -r requirements-dev.txt
31+
all_files=$(git ls-files | grep '^tests/.*\.py$' | grep -v '/__init__\.py$' | sed 's/\.py$//' | sed 's/\//./g')
32+
num_files=$(echo "$all_files" | wc -l)
33+
midpoint=$((num_files / 2))
34+
first_half_nl=$(echo "$all_files" | head -n $midpoint)
35+
second_half_nl=$(echo "$all_files" | tail -n +$(($midpoint + 1)))
36+
timeout 25m python -m unittest ${first_half_nl[@]}
37+
timeout 25m python -m unittest ${second_half_nl[@]}
38+
39+
- name: Install pypa/build
40+
run: >-
41+
python -m
42+
pip install
43+
build
44+
--user
45+
46+
- name: Build a binary wheel and a source tarball
47+
run: >-
48+
python -m
49+
build
50+
--sdist
51+
--wheel
52+
--outdir dist/
53+
.
54+
55+
- name: Publish production distribution to PyPI
56+
if: startsWith(github.ref, 'refs/tags') && ! github.event.release.prerelease
57+
uses: pypa/gh-action-pypi-publish@v1.4.2
58+
with:
59+
password: ${{ secrets.PYPI_API_TOKEN }}

medcat-v1/.gitignore

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#Directories to be ignored fully
2+
/books/
3+
/articles/
4+
/other/
5+
/output/
6+
/graphics/
7+
/models/*.dat
8+
/notebooks/wandb/
9+
/notebooks/logs/
10+
/notebooks/results/
11+
dist/
12+
tmp/
13+
*_tmp/
14+
medcat.egg-info/
15+
build/
16+
.idea
17+
venv
18+
db.sqlite3
19+
.ipynb_checkpoints
20+
21+
# vscode
22+
.vscode
23+
24+
#tmp and similar files
25+
.nfs*
26+
*.log
27+
*.pyc
28+
*.out
29+
*.swp
30+
*.swn
31+
tmp_*
32+
t_*
33+
tmp_*
34+
*_tmp
35+
*.swo
36+
*.lyx.emergency
37+
*.lyx#
38+
*~
39+
*hidden*
40+
nohup.out
41+
tmp.py
42+
.DS_Store
43+
*.lock
44+
*.egg*
45+
46+
# models files
47+
*.dat
48+
!examples/*.dat
49+
./checkpoints/
50+
51+
# Test output
52+
tests/model_creator/output/*
53+
54+
# docs outputs
55+
docs/auto/
56+
docs/_build
57+
58+
models/

medcat-v1/.readthedocs.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# .readthedocs.yaml
2+
# Read the Docs configuration file
3+
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4+
5+
version: 2
6+
7+
build:
8+
os: ubuntu-20.04
9+
tools:
10+
python: "3.10"
11+
12+
sphinx:
13+
configuration: docs/conf.py
14+
15+
python:
16+
install:
17+
- requirements: docs/requirements.txt
18+
- method: pip
19+
path: .

0 commit comments

Comments
 (0)