Skip to content
This repository was archived by the owner on Feb 16, 2026. It is now read-only.

Commit 5f8383f

Browse files
authored
Add PHP Language Support (#51)
1 parent 8dc93d4 commit 5f8383f

File tree

7 files changed

+43
-11
lines changed

7 files changed

+43
-11
lines changed

codeqai/app.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,15 @@
88
from rich.console import Console
99
from rich.markdown import Markdown
1010
from rich.syntax import Syntax
11+
from pygments.lexers import PhpLexer
1112
from streamlit.web import cli as stcli
1213
from yaspin import yaspin
1314

1415
from codeqai import codeparser, repo, utils
1516
from codeqai.bootstrap import bootstrap
1617
from codeqai.cache import create_cache_dir, get_cache_path, save_vector_cache
1718
from codeqai.config import create_config, get_config_path, load_config
18-
from codeqai.constants import DistillationMode, EmbeddingsModel, LlmHost
19+
from codeqai.constants import DistillationMode, EmbeddingsModel, Language, LlmHost
1920
from codeqai.dataset_extractor import DatasetExtractor
2021
from codeqai.embeddings import Embeddings
2122
from codeqai.vector_store import VectorStore
@@ -243,14 +244,26 @@ def run():
243244
doc.metadata["filename"], doc.page_content
244245
)
245246

246-
syntax = Syntax(
247-
indentation + doc.page_content,
248-
language.value,
249-
theme="monokai",
250-
line_numbers=True,
251-
start_line=start_line,
252-
indent_guides=True,
253-
)
247+
# PHP needs startinline=True since code snippets don't have <?php tag
248+
if language == Language.PHP:
249+
lexer = PhpLexer(startinline=True)
250+
syntax = Syntax(
251+
indentation + doc.page_content,
252+
lexer=lexer,
253+
theme="monokai",
254+
line_numbers=True,
255+
start_line=start_line,
256+
indent_guides=True,
257+
)
258+
else:
259+
syntax = Syntax(
260+
indentation + doc.page_content,
261+
language.value,
262+
theme="monokai",
263+
line_numbers=True,
264+
start_line=start_line,
265+
indent_guides=True,
266+
)
254267
print(
255268
doc.metadata["filename"] + " -> " + doc.metadata["method_name"]
256269
)

codeqai/codeparser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def parse_code_files_for_db(code_files: list[str]) -> list[Document]:
2323
documents = []
2424
code_splitter = None
2525
for code_file in code_files:
26-
with open(code_file, "r", encoding="utf-8") as file:
26+
with open(code_file, "r", encoding="utf-8", errors="replace") as file:
2727
file_bytes = file.read().encode()
2828
commit_hash = repo.get_commit_hash(code_file)
2929

@@ -87,7 +87,7 @@ def parse_code_files_for_finetuning(
8787
output_tokens = 0
8888
documents = []
8989
for code_file in code_files:
90-
with open(code_file, "r", encoding="utf-8") as file:
90+
with open(code_file, "r", encoding="utf-8", errors="replace") as file:
9191
file_bytes = file.read().encode()
9292

9393
file_extension = utils.get_file_extension(code_file)

codeqai/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class Language(Enum):
1717
LUA = "lua"
1818
HASKELL = "haskell"
1919
RUBY = "ruby"
20+
PHP = "php"
2021
UNKNOWN = "unknown"
2122

2223

codeqai/repo.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ def get_commit_hash(file_path):
123123
"cdk.out",
124124
".aws-sam",
125125
".terraform",
126+
"vendor",
126127
]
127128
WHITELIST_FILES = [
128129
".js",

codeqai/treesitter/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@
1212
from codeqai.treesitter.treesitter_rs import TreesitterRust
1313
from codeqai.treesitter.treesitter_ts import TreesitterTypescript
1414
from codeqai.treesitter.treesitter_hs import TreesitterHaskell
15+
from codeqai.treesitter.treesitter_php import TreesitterPHP
codeqai/treesitter/treesitter_php.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from codeqai.constants import Language
2+
from codeqai.treesitter.treesitter import Treesitter
3+
from codeqai.treesitter.treesitter_registry import TreesitterRegistry
4+
5+
6+
class TreesitterPHP(Treesitter):
7+
def __init__(self):
8+
super().__init__(
9+
Language.PHP, "method_declaration", "name", "comment"
10+
)
11+
12+
13+
TreesitterRegistry.register_treesitter(Language.PHP, TreesitterPHP)

codeqai/utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def get_programming_language(file_extension: str) -> Language:
3535
".cs": Language.C_SHARP,
3636
".hs": Language.HASKELL,
3737
".rb": Language.RUBY,
38+
".php": Language.PHP,
3839
}
3940
return language_mapping.get(file_extension, Language.UNKNOWN)
4041

@@ -76,6 +77,8 @@ def get_langchain_language(language: Language):
7677
return text_splitter.Language.HASKELL
7778
elif language == Language.RUBY:
7879
return text_splitter.Language.RUBY
80+
elif language == Language.PHP:
81+
return text_splitter.Language.PHP
7982
else:
8083
return None
8184

0 commit comments

Comments
 (0)