diff --git a/.gitignore b/.gitignore index 93fc9385d..f3d47be72 100644 --- a/.gitignore +++ b/.gitignore @@ -183,6 +183,8 @@ cython_debug/ # Datajoint related files dj_local_conf.json +datajoint.json +.secrets/ *.env !.vscode/launch.json # pixi environments diff --git a/docs/mkdocs.yaml b/docs/mkdocs.yaml index db2ea16f9..3aef00bad 100644 --- a/docs/mkdocs.yaml +++ b/docs/mkdocs.yaml @@ -7,6 +7,8 @@ repo_name: datajoint/datajoint-python nav: - Home: index.md - Contributing: develop.md + - How-To Guides: + - Deferred Schema Activation: how-to/deferred-schema-activation.md - Architecture: - architecture/index.md - SQL Transpilation: architecture/transpilation.md diff --git a/docs/src/archive/tutorials/dj-top.ipynb b/docs/src/archive/tutorials/dj-top.ipynb deleted file mode 100644 index 5920a9f25..000000000 --- a/docs/src/archive/tutorials/dj-top.ipynb +++ /dev/null @@ -1,1015 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Using the dj.Top restriction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First you will need to [install](../../getting-started/#installation) and [connect](../../getting-started/#connection) to a DataJoint [data pipeline](https://docs.datajoint.com/core/datajoint-python/latest/concepts/data-pipelines/#what-is-a-data-pipeline).\n", - "\n", - "Now let's start by importing the `datajoint` client." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2024-12-20 11:10:20,120][INFO]: Connecting root@127.0.0.1:3306\n", - "[2024-12-20 11:10:20,259][INFO]: Connected root@127.0.0.1:3306\n" - ] - } - ], - "source": [ - "import datajoint as dj\n", - "\n", - "dj.config[\"database.host\"] = \"127.0.0.1\"\n", - "schema = dj.Schema(\"university\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Table Definition" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "@schema\n", - "class Student(dj.Manual):\n", - " definition = \"\"\"\n", - " student_id : int unsigned # university-wide ID number\n", - " ---\n", - " first_name : varchar(40)\n", - " last_name : varchar(40)\n", - " sex : enum('F', 'M', 'U')\n", - " date_of_birth : date\n", - " home_address : varchar(120) # mailing street address\n", - " home_city : varchar(60) # mailing address\n", - " home_state : char(2) # US state acronym: e.g. OH\n", - " home_zip : char(10) # zipcode e.g. 93979-4979\n", - " home_phone : varchar(20) # e.g. 414.657.6883x0881\n", - " \"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "@schema\n", - "class Department(dj.Manual):\n", - " definition = \"\"\"\n", - " dept : varchar(6) # abbreviated department name, e.g. 
BIOL\n", - " ---\n", - " dept_name : varchar(200) # full department name\n", - " dept_address : varchar(200) # mailing address\n", - " dept_phone : varchar(20)\n", - " \"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "@schema\n", - "class StudentMajor(dj.Manual):\n", - " definition = \"\"\"\n", - " -> Student\n", - " ---\n", - " -> Department\n", - " declare_date : date # when student declared her major\n", - " \"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "@schema\n", - "class Course(dj.Manual):\n", - " definition = \"\"\"\n", - " -> Department\n", - " course : int unsigned # course number, e.g. 1010\n", - " ---\n", - " course_name : varchar(200) # e.g. \"Neurobiology of Sensation and Movement.\"\n", - " credits : decimal(3,1) # number of credits earned by completing the course\n", - " \"\"\"\n", - "\n", - "\n", - "@schema\n", - "class Term(dj.Manual):\n", - " definition = \"\"\"\n", - " term_year : year\n", - " term : enum('Spring', 'Summer', 'Fall')\n", - " \"\"\"\n", - "\n", - "\n", - "@schema\n", - "class Section(dj.Manual):\n", - " definition = \"\"\"\n", - " -> Course\n", - " -> Term\n", - " section : char(1)\n", - " ---\n", - " auditorium : varchar(12)\n", - " \"\"\"\n", - "\n", - "\n", - "@schema\n", - "class CurrentTerm(dj.Manual):\n", - " definition = \"\"\"\n", - " -> Term\n", - " \"\"\"\n", - "\n", - "\n", - "@schema\n", - "class Enroll(dj.Manual):\n", - " definition = \"\"\"\n", - " -> Student\n", - " -> Section\n", - " \"\"\"\n", - "\n", - "\n", - "@schema\n", - "class LetterGrade(dj.Lookup):\n", - " definition = \"\"\"\n", - " grade : char(2)\n", - " ---\n", - " points : decimal(3,2)\n", - " \"\"\"\n", - " contents = [\n", - " [\"A\", 4.00],\n", - " [\"A-\", 3.67],\n", - " [\"B+\", 3.33],\n", - " [\"B\", 3.00],\n", - " [\"B-\", 2.67],\n", - " [\"C+\", 2.33],\n", - " [\"C\", 2.00],\n", - " [\"C-\", 1.67],\n", - " [\"D+\", 1.33],\n", - " [\"D\", 1.00],\n", - " [\"F\", 0.00],\n", - " ]\n", - "\n", - "\n", - "@schema\n", - "class Grade(dj.Manual):\n", - " definition = \"\"\"\n", - " -> Enroll\n", - " ---\n", - " -> LetterGrade\n", - " \"\"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Insert" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "from tqdm import tqdm\n", - "import faker\n", - "import random\n", - "import datetime\n", - "\n", - "fake = faker.Faker()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "def yield_students():\n", - " fake_name = {\"F\": fake.name_female, \"M\": fake.name_male}\n", - " while True: # ignore invalid values\n", - " try:\n", - " sex = random.choice((\"F\", \"M\"))\n", - " first_name, last_name = fake_name[sex]().split(\" \")[:2]\n", - " street_address, city = fake.address().split(\"\\n\")\n", - " city, state = city.split(\", \")\n", - " state, zipcode = state.split(\" \")\n", - " except ValueError:\n", - " continue\n", - " else:\n", - " yield dict(\n", - " first_name=first_name,\n", - " last_name=last_name,\n", - " sex=sex,\n", - " home_address=street_address,\n", - " home_city=city,\n", - " home_state=state,\n", - " home_zip=zipcode,\n", - " date_of_birth=str(fake.date_time_between(start_date=\"-35y\", end_date=\"-15y\").date()),\n", - " home_phone=fake.phone_number()[:20],\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - 
"metadata": {}, - "outputs": [], - "source": [ - "Student.insert(dict(k, student_id=i) for i, k in zip(range(100, 300), yield_students()))\n", - "\n", - "Department.insert(\n", - " dict(\n", - " dept=dept,\n", - " dept_name=name,\n", - " dept_address=fake.address(),\n", - " dept_phone=fake.phone_number()[:20],\n", - " )\n", - " for dept, name in [\n", - " [\"CS\", \"Computer Science\"],\n", - " [\"BIOL\", \"Life Sciences\"],\n", - " [\"PHYS\", \"Physics\"],\n", - " [\"MATH\", \"Mathematics\"],\n", - " ]\n", - ")\n", - "\n", - "StudentMajor.insert(\n", - " {**s, **d, \"declare_date\": fake.date_between(start_date=datetime.date(1999, 1, 1))}\n", - " for s, d in zip(Student.fetch(\"KEY\"), random.choices(Department.fetch(\"KEY\"), k=len(Student())))\n", - " if random.random() < 0.75\n", - ")\n", - "\n", - "# from https://www.utah.edu/\n", - "Course.insert(\n", - " [\n", - " [\"BIOL\", 1006, \"World of Dinosaurs\", 3],\n", - " [\"BIOL\", 1010, \"Biology in the 21st Century\", 3],\n", - " [\"BIOL\", 1030, \"Human Biology\", 3],\n", - " [\"BIOL\", 1210, \"Principles of Biology\", 4],\n", - " [\"BIOL\", 2010, \"Evolution & Diversity of Life\", 3],\n", - " [\"BIOL\", 2020, \"Principles of Cell Biology\", 3],\n", - " [\"BIOL\", 2021, \"Principles of Cell Science\", 4],\n", - " [\"BIOL\", 2030, \"Principles of Genetics\", 3],\n", - " [\"BIOL\", 2210, \"Human Genetics\", 3],\n", - " [\"BIOL\", 2325, \"Human Anatomy\", 4],\n", - " [\"BIOL\", 2330, \"Plants & Society\", 3],\n", - " [\"BIOL\", 2355, \"Field Botany\", 2],\n", - " [\"BIOL\", 2420, \"Human Physiology\", 4],\n", - " [\"PHYS\", 2040, \"Classcal Theoretical Physics II\", 4],\n", - " [\"PHYS\", 2060, \"Quantum Mechanics\", 3],\n", - " [\"PHYS\", 2100, \"General Relativity and Cosmology\", 3],\n", - " [\"PHYS\", 2140, \"Statistical Mechanics\", 4],\n", - " [\"PHYS\", 2210, \"Physics for Scientists and Engineers I\", 4],\n", - " [\"PHYS\", 2220, \"Physics for Scientists and Engineers II\", 4],\n", - " [\"PHYS\", 3210, \"Physics for Scientists I (Honors)\", 4],\n", - " [\"PHYS\", 3220, \"Physics for Scientists II (Honors)\", 4],\n", - " [\"MATH\", 1250, \"Calculus for AP Students I\", 4],\n", - " [\"MATH\", 1260, \"Calculus for AP Students II\", 4],\n", - " [\"MATH\", 1210, \"Calculus I\", 4],\n", - " [\"MATH\", 1220, \"Calculus II\", 4],\n", - " [\"MATH\", 2210, \"Calculus III\", 3],\n", - " [\"MATH\", 2270, \"Linear Algebra\", 4],\n", - " [\"MATH\", 2280, \"Introduction to Differential Equations\", 4],\n", - " [\"MATH\", 3210, \"Foundations of Analysis I\", 4],\n", - " [\"MATH\", 3220, \"Foundations of Analysis II\", 4],\n", - " [\"CS\", 1030, \"Foundations of Computer Science\", 3],\n", - " [\"CS\", 1410, \"Introduction to Object-Oriented Programming\", 4],\n", - " [\"CS\", 2420, \"Introduction to Algorithms & Data Structures\", 4],\n", - " [\"CS\", 2100, \"Discrete Structures\", 3],\n", - " [\"CS\", 3500, \"Software Practice\", 4],\n", - " [\"CS\", 3505, \"Software Practice II\", 3],\n", - " [\"CS\", 3810, \"Computer Organization\", 4],\n", - " [\"CS\", 4400, \"Computer Systems\", 4],\n", - " [\"CS\", 4150, \"Algorithms\", 3],\n", - " [\"CS\", 3100, \"Models of Computation\", 3],\n", - " [\"CS\", 3200, \"Introduction to Scientific Computing\", 3],\n", - " [\"CS\", 4000, \"Senior Capstone Project - Design Phase\", 3],\n", - " [\"CS\", 4500, \"Senior Capstone Project\", 3],\n", - " [\"CS\", 4940, \"Undergraduate Research\", 3],\n", - " [\"CS\", 4970, \"Computer Science Bachelors Thesis\", 3],\n", - " ]\n", - ")\n", - "\n", - 
"Term.insert(dict(term_year=year, term=term) for year in range(1999, 2019) for term in [\"Spring\", \"Summer\", \"Fall\"])\n", - "\n", - "Term().fetch(order_by=(\"term_year DESC\", \"term DESC\"), as_dict=True, limit=1)[0]\n", - "\n", - "CurrentTerm().insert1({**Term().fetch(order_by=(\"term_year DESC\", \"term DESC\"), as_dict=True, limit=1)[0]})\n", - "\n", - "\n", - "def make_section(prob):\n", - " for c in (Course * Term).proj():\n", - " for sec in \"abcd\":\n", - " if random.random() < prob:\n", - " break\n", - " yield {\n", - " **c,\n", - " \"section\": sec,\n", - " \"auditorium\": random.choice(\"ABCDEF\") + str(random.randint(1, 100)),\n", - " }\n", - "\n", - "\n", - "Section.insert(make_section(0.5))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 200/200 [00:27<00:00, 7.17it/s]\n" - ] - } - ], - "source": [ - "# Enrollment\n", - "terms = Term().fetch(\"KEY\")\n", - "quit_prob = 0.1\n", - "for student in tqdm(Student.fetch(\"KEY\")):\n", - " start_term = random.randrange(len(terms))\n", - " for term in terms[start_term:]:\n", - " if random.random() < quit_prob:\n", - " break\n", - " else:\n", - " sections = ((Section & term) - (Course & (Enroll & student))).fetch(\"KEY\")\n", - " if sections:\n", - " Enroll.insert(\n", - " {**student, **section} for section in random.sample(sections, random.randrange(min(5, len(sections))))\n", - " )\n", - "\n", - "# assign random grades\n", - "grades = LetterGrade.fetch(\"grade\")\n", - "\n", - "grade_keys = Enroll.fetch(\"KEY\")\n", - "random.shuffle(grade_keys)\n", - "grade_keys = grade_keys[: len(grade_keys) * 9 // 10]\n", - "\n", - "Grade.insert({**key, \"grade\": grade} for key, grade in zip(grade_keys, random.choices(grades, k=len(grade_keys))))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# dj.Top Restriction" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

student_id

\n", - " university-wide ID number\n", - "
\n", - "

dept

\n", - " abbreviated department name, e.g. BIOL\n", - "
\n", - "

course

\n", - " course number, e.g. 1010\n", - "
\n", - "

term_year

\n", - " \n", - "
\n", - "

term

\n", - " \n", - "
\n", - "

section

\n", - " \n", - "
\n", - "

grade

\n", - " \n", - "
\n", - "

points

\n", - " \n", - "
100MATH22802018FallaA-3.67
191MATH22102018SpringbA4.00
211CS21002018FallaA4.00
273PHYS21002018SpringaA4.00
282BIOL20212018SpringdA4.00
\n", - " \n", - "

Total: 5

\n", - " " - ], - "text/plain": [ - "*student_id *dept *course *term_year *term *section *grade points \n", - "+------------+ +------+ +--------+ +-----------+ +--------+ +---------+ +-------+ +--------+\n", - "100 MATH 2280 2018 Fall a A- 3.67 \n", - "191 MATH 2210 2018 Spring b A 4.00 \n", - "211 CS 2100 2018 Fall a A 4.00 \n", - "273 PHYS 2100 2018 Spring a A 4.00 \n", - "282 BIOL 2021 2018 Spring d A 4.00 \n", - " (Total: 5)" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(Grade * LetterGrade) & \"term_year='2018'\" & dj.Top(limit=5, order_by=\"points DESC\", offset=5)" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"SELECT `grade`,`student_id`,`dept`,`course`,`term_year`,`term`,`section`,`points` FROM `university`.`#letter_grade` NATURAL JOIN `university`.`grade` WHERE ( (term_year='2018')) ORDER BY `points` DESC LIMIT 10\"" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "((LetterGrade * Grade) & \"term_year='2018'\" & dj.Top(limit=10, order_by=\"points DESC\", offset=0)).make_sql()" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"SELECT `student_id`,`dept`,`course`,`term_year`,`term`,`section`,`grade`,`points` FROM `university`.`grade` NATURAL JOIN `university`.`#letter_grade` WHERE ( (term_year='2018')) ORDER BY `points` DESC LIMIT 20\"" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "((Grade * LetterGrade) & \"term_year='2018'\" & dj.Top(limit=20, order_by=\"points DESC\", offset=0)).make_sql()" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

student_id

\n", - " university-wide ID number\n", - "
\n", - "

dept

\n", - " abbreviated department name, e.g. BIOL\n", - "
\n", - "

course

\n", - " course number, e.g. 1010\n", - "
\n", - "

term_year

\n", - " \n", - "
\n", - "

term

\n", - " \n", - "
\n", - "

section

\n", - " \n", - "
\n", - "

grade

\n", - " \n", - "
\n", - "

points

\n", - " \n", - "
100CS32002018FallcA4.00
100MATH22802018FallaA-3.67
100PHYS22102018SpringdA4.00
122CS10302018FallcB+3.33
131BIOL20302018SpringaA4.00
131CS32002018FallbB+3.33
136BIOL22102018SpringcB+3.33
136MATH22102018FallbB+3.33
141BIOL20102018SummercB+3.33
141CS24202018FallbA4.00
141CS32002018FallbA-3.67
182CS14102018SummercA-3.67
\n", - "

...

\n", - "

Total: 20

\n", - " " - ], - "text/plain": [ - "*student_id *dept *course *term_year *term *section *grade points \n", - "+------------+ +------+ +--------+ +-----------+ +--------+ +---------+ +-------+ +--------+\n", - "100 CS 3200 2018 Fall c A 4.00 \n", - "100 MATH 2280 2018 Fall a A- 3.67 \n", - "100 PHYS 2210 2018 Spring d A 4.00 \n", - "122 CS 1030 2018 Fall c B+ 3.33 \n", - "131 BIOL 2030 2018 Spring a A 4.00 \n", - "131 CS 3200 2018 Fall b B+ 3.33 \n", - "136 BIOL 2210 2018 Spring c B+ 3.33 \n", - "136 MATH 2210 2018 Fall b B+ 3.33 \n", - "141 BIOL 2010 2018 Summer c B+ 3.33 \n", - "141 CS 2420 2018 Fall b A 4.00 \n", - "141 CS 3200 2018 Fall b A- 3.67 \n", - "182 CS 1410 2018 Summer c A- 3.67 \n", - " ...\n", - " (Total: 20)" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(Grade * LetterGrade) & \"term_year='2018'\" & dj.Top(limit=20, order_by=\"points DESC\", offset=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

grade

\n", - " \n", - "
\n", - "

student_id

\n", - " university-wide ID number\n", - "
\n", - "

dept

\n", - " abbreviated department name, e.g. BIOL\n", - "
\n", - "

course

\n", - " course number, e.g. 1010\n", - "
\n", - "

term_year

\n", - " \n", - "
\n", - "

term

\n", - " \n", - "
\n", - "

section

\n", - " \n", - "
\n", - "

points

\n", - " \n", - "
A100CS32002018Fallc4.00
A100PHYS22102018Springd4.00
A131BIOL20302018Springa4.00
A141CS24202018Fallb4.00
A186PHYS22102018Springa4.00
A191MATH22102018Springb4.00
A211CS21002018Falla4.00
A273PHYS21002018Springa4.00
A282BIOL20212018Springd4.00
A-100MATH22802018Falla3.67
A-141CS32002018Fallb3.67
A-182CS14102018Summerc3.67
\n", - "

...

\n", - "

Total: 20

\n", - " " - ], - "text/plain": [ - "*grade *student_id *dept *course *term_year *term *section points \n", - "+-------+ +------------+ +------+ +--------+ +-----------+ +--------+ +---------+ +--------+\n", - "A 100 CS 3200 2018 Fall c 4.00 \n", - "A 100 PHYS 2210 2018 Spring d 4.00 \n", - "A 131 BIOL 2030 2018 Spring a 4.00 \n", - "A 141 CS 2420 2018 Fall b 4.00 \n", - "A 186 PHYS 2210 2018 Spring a 4.00 \n", - "A 191 MATH 2210 2018 Spring b 4.00 \n", - "A 211 CS 2100 2018 Fall a 4.00 \n", - "A 273 PHYS 2100 2018 Spring a 4.00 \n", - "A 282 BIOL 2021 2018 Spring d 4.00 \n", - "A- 100 MATH 2280 2018 Fall a 3.67 \n", - "A- 141 CS 3200 2018 Fall b 3.67 \n", - "A- 182 CS 1410 2018 Summer c 3.67 \n", - " ...\n", - " (Total: 20)" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(LetterGrade * Grade) & \"term_year='2018'\" & dj.Top(limit=20, order_by=\"points DESC\", offset=0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "elements", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/src/archive/tutorials/json.ipynb b/docs/src/archive/tutorials/json.ipynb deleted file mode 100644 index 9c5feebf6..000000000 --- a/docs/src/archive/tutorials/json.ipynb +++ /dev/null @@ -1,1080 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "7fe24127-c0d0-4ff8-96b4-6ab0d9307e73", - "metadata": {}, - "source": [ - "# Using the json type" - ] - }, - { - "cell_type": "markdown", - "id": "62450023", - "metadata": {}, - "source": [ - "> ⚠️ Note the following before using the `json` type\n", - "> - Supported only for MySQL >= 8.0 when [JSON_VALUE](https://dev.mysql.com/doc/refman/8.0/en/json-search-functions.html#function_json-value) introduced.\n", - "> - Equivalent Percona is fully-compatible.\n", - "> - MariaDB is not supported since [JSON_VALUE](https://mariadb.com/kb/en/json_value/#syntax) does not allow type specification like MySQL's.\n", - "> - Not yet supported in DataJoint MATLAB" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "67cf93d2", - "metadata": {}, - "source": [ - "First you will need to [install](../../getting-started/#installation) and [connect](../../getting-started/#connection) to a DataJoint [data pipeline](https://docs.datajoint.com/core/datajoint-python/latest/concepts/data-pipelines/#what-is-a-data-pipeline).\n", - "\n", - "Now let's start by importing the `datajoint` client." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "bc0b6f54-8f11-45f4-bf8d-e1058ee0056f", - "metadata": {}, - "outputs": [], - "source": [ - "import datajoint as dj" - ] - }, - { - "cell_type": "markdown", - "id": "3544cab9-f2db-458a-9431-939bea5affc5", - "metadata": {}, - "source": [ - "## Table Definition" - ] - }, - { - "cell_type": "markdown", - "id": "a2998c71", - "metadata": {}, - "source": [ - "For this exercise, let's imagine we work for an awesome company that is organizing a fun RC car race across various teams in the company. Let's see which team has the fastest car! 
🏎️\n", - "\n", - "This establishes 2 important entities: a `Team` and a `Car`. Normally the entities are mapped to their own dedicated table, however, let's assume that `Team` is well-structured but `Car` is less structured than we'd prefer. In other words, the structure for what makes up a *car* is varying too much between entries (perhaps because users of the pipeline haven't agreed yet on the definition? 🤷).\n", - "\n", - "This would make it a good use-case to keep `Team` as a table but make `Car` a `json` type defined within the `Team` table.\n", - "\n", - "Let's begin." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "dc318298-b819-4f06-abbd-7bb7544dd431", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2023-02-12 00:14:33,027][INFO]: Connecting root@fakeservices.datajoint.io:3306\n", - "[2023-02-12 00:14:33,039][INFO]: Connected root@fakeservices.datajoint.io:3306\n" - ] - } - ], - "source": [ - "schema = dj.Schema(f\"{dj.config['database.user']}_json\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "4aaf96db-85d9-4e94-a4c3-3558f4cc6671", - "metadata": {}, - "outputs": [], - "source": [ - "@schema\n", - "class Team(dj.Lookup):\n", - " definition = \"\"\"\n", - " # A team within a company\n", - " name: varchar(40) # team name\n", - " ---\n", - " car=null: json # A car belonging to a team (null to allow registering first but specifying car later)\n", - " \n", - " unique index(car.length:decimal(4, 1)) # Add an index if this key is frequently accessed\n", - " \"\"\"" - ] - }, - { - "cell_type": "markdown", - "id": "640bf7a7-9e07-4953-9c8a-304e55c467f8", - "metadata": {}, - "source": [ - "## Insert" - ] - }, - { - "cell_type": "markdown", - "id": "7081e577", - "metadata": {}, - "source": [ - "Let's suppose that engineering is first up to register their car." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "30f0d62e", - "metadata": {}, - "outputs": [], - "source": [ - "Team.insert1(\n", - " {\n", - " \"name\": \"engineering\",\n", - " \"car\": {\n", - " \"name\": \"Rever\",\n", - " \"length\": 20.5,\n", - " \"inspected\": True,\n", - " \"tire_pressure\": [32, 31, 33, 34],\n", - " \"headlights\": [\n", - " {\n", - " \"side\": \"left\",\n", - " \"hyper_white\": None,\n", - " },\n", - " {\n", - " \"side\": \"right\",\n", - " \"hyper_white\": None,\n", - " },\n", - " ],\n", - " },\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ee5e4dcf", - "metadata": {}, - "source": [ - "Next, business and marketing teams are up and register their cars.\n", - "\n", - "A few points to notice below:\n", - "- The person signing up on behalf of marketing does not know the specifics of the car during registration but another team member will be updating this soon before the race.\n", - "- Notice how the `business` and `engineering` teams appear to specify the same property but refer to it as `safety_inspected` and `inspected` respectfully." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "b532e16c", - "metadata": {}, - "outputs": [], - "source": [ - "Team.insert(\n", - " [\n", - " {\n", - " \"name\": \"marketing\",\n", - " \"car\": None,\n", - " },\n", - " {\n", - " \"name\": \"business\",\n", - " \"car\": {\n", - " \"name\": \"Chaching\",\n", - " \"length\": 100,\n", - " \"safety_inspected\": False,\n", - " \"tire_pressure\": [34, 30, 27, 32],\n", - " \"headlights\": [\n", - " {\n", - " \"side\": \"left\",\n", - " \"hyper_white\": True,\n", - " },\n", - " {\n", - " \"side\": \"right\",\n", - " \"hyper_white\": True,\n", - " },\n", - " ],\n", - " },\n", - " },\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "57365de7", - "metadata": {}, - "source": [ - "We can preview the table data much like normal but notice how the value of `car` behaves like other BLOB-like attributes." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "0e3b517c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " A team within a company\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "
\n", - "

name

\n", - " team name\n", - "
\n", - "

car

\n", - " A car belonging to a team (null to allow registering first but specifying car later)\n", - "
marketing=BLOB=
engineering=BLOB=
business=BLOB=
\n", - " \n", - "

Total: 3

\n", - " " - ], - "text/plain": [ - "*name car \n", - "+------------+ +--------+\n", - "marketing =BLOB= \n", - "engineering =BLOB= \n", - "business =BLOB= \n", - " (Total: 3)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Team()" - ] - }, - { - "cell_type": "markdown", - "id": "c95cbbee-4ef7-4870-ad42-a60345a3644f", - "metadata": {}, - "source": [ - "## Restriction" - ] - }, - { - "cell_type": "markdown", - "id": "8b454996", - "metadata": {}, - "source": [ - "Now let's see what kinds of queries we can form to demostrate how we can query this pipeline." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "81efda24", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " A team within a company\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "
\n", - "

name

\n", - " team name\n", - "
\n", - "

car

\n", - " A car belonging to a team (null to allow registering first but specifying car later)\n", - "
business=BLOB=
\n", - " \n", - "

Total: 1

\n", - " " - ], - "text/plain": [ - "*name car \n", - "+----------+ +--------+\n", - "business =BLOB= \n", - " (Total: 1)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Which team has a `car` equal to 100 inches long?\n", - "Team & {\"car.length\": 100}" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "fd7b855d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " A team within a company\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "
\n", - "

name

\n", - " team name\n", - "
\n", - "

car

\n", - " A car belonging to a team (null to allow registering first but specifying car later)\n", - "
engineering=BLOB=
\n", - " \n", - "

Total: 1

\n", - " " - ], - "text/plain": [ - "*name car \n", - "+------------+ +--------+\n", - "engineering =BLOB= \n", - " (Total: 1)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Which team has a `car` less than 50 inches long?\n", - "Team & \"car->>'$.length' < 50\"" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "b76ebb75", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " A team within a company\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "
\n", - "

name

\n", - " team name\n", - "
\n", - "

car

\n", - " A car belonging to a team (null to allow registering first but specifying car later)\n", - "
engineering=BLOB=
\n", - " \n", - "

Total: 1

\n", - " " - ], - "text/plain": [ - "*name car \n", - "+------------+ +--------+\n", - "engineering =BLOB= \n", - " (Total: 1)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Any team that has had their car inspected?\n", - "Team & [{\"car.inspected:unsigned\": True}, {\"car.safety_inspected:unsigned\": True}]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "b787784c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " A team within a company\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "
\n", - "

name

\n", - " team name\n", - "
\n", - "

car

\n", - " A car belonging to a team (null to allow registering first but specifying car later)\n", - "
engineering=BLOB=
marketing=BLOB=
\n", - " \n", - "

Total: 2

\n", - " " - ], - "text/plain": [ - "*name car \n", - "+------------+ +--------+\n", - "engineering =BLOB= \n", - "marketing =BLOB= \n", - " (Total: 2)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Which teams do not have hyper white lights for their first head light?\n", - "Team & {\"car.headlights[0].hyper_white\": None}" - ] - }, - { - "cell_type": "markdown", - "id": "5bcf0b5d", - "metadata": {}, - "source": [ - "Notice that the previous query will satisfy the `None` check if it experiences any of the following scenarious:\n", - "- if entire record missing (`marketing` satisfies this)\n", - "- JSON key is missing\n", - "- JSON value is set to JSON `null` (`engineering` satisfies this)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "bcf1682e-a0c7-4c2f-826b-0aec9052a694", - "metadata": {}, - "source": [ - "## Projection" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "daea110e", - "metadata": {}, - "source": [ - "Projections can be quite useful with the `json` type since we can extract out just what we need. This allows greater query flexibility but more importantly, for us to be able to fetch only what is pertinent." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "8fb8334a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

name

\n", - " team name\n", - "
\n", - "

car_name

\n", - " calculated attribute\n", - "
\n", - "

car_length

\n", - " calculated attribute\n", - "
businessChaching100
engineeringRever20.5
marketingNoneNone
\n", - " \n", - "

Total: 3

\n", - " " - ], - "text/plain": [ - "*name car_name car_length \n", - "+------------+ +----------+ +------------+\n", - "business Chaching 100 \n", - "engineering Rever 20.5 \n", - "marketing None None \n", - " (Total: 3)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Only interested in the car names and the length but let the type be inferred\n", - "q_untyped = Team.proj(\n", - " car_name=\"car.name\",\n", - " car_length=\"car.length\",\n", - ")\n", - "q_untyped" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "bb5f0448", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'name': 'business', 'car_name': 'Chaching', 'car_length': '100'},\n", - " {'name': 'engineering', 'car_name': 'Rever', 'car_length': '20.5'},\n", - " {'name': 'marketing', 'car_name': None, 'car_length': None}]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "q_untyped.fetch(as_dict=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "a307dfd7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

name

\n", - " team name\n", - "
\n", - "

car_name

\n", - " calculated attribute\n", - "
\n", - "

car_length

\n", - " calculated attribute\n", - "
businessChaching100.0
engineeringRever20.5
marketingNoneNone
\n", - " \n", - "

Total: 3

\n", - " " - ], - "text/plain": [ - "*name car_name car_length \n", - "+------------+ +----------+ +------------+\n", - "business Chaching 100.0 \n", - "engineering Rever 20.5 \n", - "marketing None None \n", - " (Total: 3)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Nevermind, I'll specify the type explicitly\n", - "q_typed = Team.proj(\n", - " car_name=\"car.name\",\n", - " car_length=\"car.length:float\",\n", - ")\n", - "q_typed" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "8a93dbf9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'name': 'business', 'car_name': 'Chaching', 'car_length': 100.0},\n", - " {'name': 'engineering', 'car_name': 'Rever', 'car_length': 20.5},\n", - " {'name': 'marketing', 'car_name': None, 'car_length': None}]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "q_typed.fetch(as_dict=True)" - ] - }, - { - "cell_type": "markdown", - "id": "62dd0239-fa70-4369-81eb-3d46c5053fee", - "metadata": {}, - "source": [ - "## Describe" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "73d9df01", - "metadata": {}, - "source": [ - "Lastly, the `.describe()` function on the `Team` table can help us generate the table's definition. This is useful if we are connected directly to the pipeline without the original source." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "0e739932", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# A team within a company\n", - "name : varchar(40) # team name\n", - "---\n", - "car=null : json # A car belonging to a team (null to allow registering first but specifying car later)\n", - "UNIQUE INDEX ((json_value(`car`, _utf8mb4'$.length' returning decimal(4, 1))))\n", - "\n" - ] - } - ], - "source": [ - "rebuilt_definition = Team.describe()\n", - "print(rebuilt_definition)" - ] - }, - { - "cell_type": "markdown", - "id": "be1070d5-765b-4bc2-92de-8a6ffd885984", - "metadata": {}, - "source": [ - "## Cleanup" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "cb959927", - "metadata": {}, - "source": [ - "Finally, let's clean up what we created in this tutorial." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d9cc28a3-3ffd-4126-b7e9-bc6365040b93", - "metadata": {}, - "outputs": [], - "source": [ - "schema.drop()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "68ad4340", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "all_purposes", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/src/how-to/deferred-schema-activation.md b/docs/src/how-to/deferred-schema-activation.md new file mode 100644 index 000000000..900d3be13 --- /dev/null +++ b/docs/src/how-to/deferred-schema-activation.md @@ -0,0 +1,151 @@ +# Deferred Schema Activation + +Define table classes without an immediate database connection, then activate +the schema later when ready to connect. 
+ +## When to use deferred activation + +Deferred schema activation is useful when you want to: + +- Define reusable table modules that work with different databases +- Write testable code where the database connection is injected at runtime +- Deploy the same pipeline to multiple environments (development, staging, + production) +- Import table definitions without triggering database connections + +## Define tables without a database connection + +Create a schema object without providing a schema name: + +```python +import datajoint as dj + +# Create schema without activation +schema = dj.Schema() + +@schema +class Subject(dj.Manual): + definition = """ + subject_id : int + --- + subject_name : varchar(64) + """ + +@schema +class Session(dj.Manual): + definition = """ + -> Subject + session_date : date + --- + session_notes : varchar(256) + """ +``` + +The `@schema` decorator queues table classes for later declaration. No database +connection is made until you call `activate()`. + +## Check activation status + +To check whether a schema has been activated: + +```python +schema.is_activated() # Returns False before activation +``` + +## Activate the schema + +When ready to connect, call `activate()` with the database schema name: + +```python +schema.activate('my_project') +``` + +This: + +1. Connects to the database (using `dj.conn()` by default) +2. Creates the schema if it doesn't exist +3. Declares all queued tables in the order they were decorated + +## Activate with a specific connection + +To use a specific database connection: + +```python +connection = dj.conn( + host='production-server.example.com', + user='pipeline_user', + password='secret' +) + +schema.activate('my_project', connection=connection) +``` + +## Activate with options + +Control schema and table creation behavior: + +```python +# Connect to existing schema only (don't create if missing) +schema.activate('my_project', create_schema=False) + +# Don't create tables automatically +schema.activate('my_project', create_tables=False) +``` + +## Example: environment-based activation + +```python +# pipeline/tables.py +import datajoint as dj + +schema = dj.Schema() + +@schema +class Experiment(dj.Manual): + definition = """ + experiment_id : int + --- + experiment_date : date + """ + +# pipeline/activate.py +import os +from pipeline.tables import schema + +env = os.environ.get('ENVIRONMENT', 'development') + +schema_names = { + 'development': 'dev_experiments', + 'staging': 'staging_experiments', + 'production': 'prod_experiments', +} + +schema.activate(schema_names[env]) +``` + +## Example: test fixtures + +```python +import pytest +import datajoint as dj +from mypackage.tables import schema, Subject, Session + +@pytest.fixture +def test_schema(db_credentials): + """Activate schema with test database.""" + schema.activate( + 'test_pipeline', + connection=dj.conn(**db_credentials) + ) + yield schema + schema.drop() # Clean up after tests +``` + +## Restrictions + +- A schema can only be activated once. Attempting to activate for a different + database raises `DataJointError`. +- Calling `activate()` without a schema name on an unactivated schema raises + `DataJointError`. +- Part tables should not be decorated directly; they are processed automatically + with their master table. 
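+
+## Example: handling activation errors
+
+A minimal sketch of the first restriction above. It assumes the schema has
+already been activated once; a second `activate()` call with a different
+schema name is expected to raise `DataJointError`:
+
+```python
+import datajoint as dj
+
+schema = dj.Schema()
+schema.activate('my_project')  # first activation succeeds
+
+try:
+    schema.activate('other_project')  # conflicting re-activation
+except dj.DataJointError as err:
+    print(f"re-activation rejected: {err}")
+```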
diff --git a/src/datajoint/objectref.py b/src/datajoint/objectref.py index 9a049b2cf..d318fe57e 100644 --- a/src/datajoint/objectref.py +++ b/src/datajoint/objectref.py @@ -44,7 +44,7 @@ class ObjectRef: ext: File extension as tooling hint (e.g., ".dat", ".zarr") or None. This is a conventional suffix for tooling, not a content-type declaration. is_dir: True if stored content is a directory/key-prefix (e.g., Zarr store) - timestamp: ISO 8601 upload timestamp + timestamp: ISO 8601 upload timestamp, or None if not recorded mime_type: MIME type (files only, auto-detected from extension) item_count: Number of files (folders only), or None if not computed """ @@ -54,7 +54,7 @@ class ObjectRef: hash: str | None ext: str | None is_dir: bool - timestamp: datetime + timestamp: datetime | None url: str | None = None store: str | None = None mime_type: str | None = None @@ -128,32 +128,6 @@ def to_json(self) -> dict: data["item_count"] = self.item_count return data - def to_dict(self) -> dict: - """ - Return the raw JSON metadata as a dictionary. - - This is useful for inspecting the stored metadata without triggering - any storage backend operations. The returned dict matches the JSON - structure stored in the database. - - Returns - ------- - dict - Dict containing the object metadata: - - - path: Relative storage path within the store - - url: Full URI (e.g., 's3://bucket/path') (optional) - - store: Store name (optional, None for default store) - - size: File/folder size in bytes (or None) - - hash: Content hash (or None) - - ext: File extension (or None) - - is_dir: True if folder - - timestamp: Upload timestamp - - mime_type: MIME type (files only, optional) - - item_count: Number of files (folders only, optional) - """ - return self.to_json() - def _ensure_backend(self): """Ensure storage backend is available for I/O operations.""" if self._backend is None: diff --git a/src/datajoint/storage.py b/src/datajoint/storage.py index 6dacbd7ec..846228137 100644 --- a/src/datajoint/storage.py +++ b/src/datajoint/storage.py @@ -24,13 +24,13 @@ # Characters safe for use in filenames and URLs TOKEN_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" -# Supported remote URL protocols for copy insert -REMOTE_PROTOCOLS = ("s3://", "gs://", "gcs://", "az://", "abfs://", "http://", "https://") +# Supported URL protocols +URL_PROTOCOLS = ("file://", "s3://", "gs://", "gcs://", "az://", "abfs://", "http://", "https://") -def is_remote_url(path: str) -> bool: +def is_url(path: str) -> bool: """ - Check if a path is a remote URL. + Check if a path is a URL. Parameters ---------- @@ -40,21 +40,57 @@ def is_remote_url(path: str) -> bool: Returns ------- bool - True if path starts with a supported remote protocol. + True if path starts with a supported URL protocol. """ - if not isinstance(path, str): - return False - return path.lower().startswith(REMOTE_PROTOCOLS) + return path.lower().startswith(URL_PROTOCOLS) -def parse_remote_url(url: str) -> tuple[str, str]: +def normalize_to_url(path: str) -> str: """ - Parse a remote URL into protocol and path. + Normalize a path to URL form. + + Converts local filesystem paths to file:// URLs. URLs are returned unchanged. + + Parameters + ---------- + path : str + Path string (local path or URL). + + Returns + ------- + str + URL form of the path. 
+ + Examples + -------- + >>> normalize_to_url("/data/file.dat") + 'file:///data/file.dat' + >>> normalize_to_url("s3://bucket/key") + 's3://bucket/key' + >>> normalize_to_url("file:///already/url") + 'file:///already/url' + """ + if is_url(path): + return path + # Convert local path to file:// URL + # Ensure absolute path and proper format + abs_path = str(Path(path).resolve()) + # Handle Windows paths (C:\...) vs Unix paths (/...) + if abs_path.startswith("/"): + return f"file://{abs_path}" + else: + # Windows: file:///C:/path + return f"file:///{abs_path.replace(chr(92), '/')}" + + +def parse_url(url: str) -> tuple[str, str]: + """ + Parse a URL into protocol and path. Parameters ---------- url : str - Remote URL (e.g., ``'s3://bucket/path/file.dat'``). + URL (e.g., ``'s3://bucket/path/file.dat'`` or ``'file:///path/to/file'``). Returns ------- @@ -65,11 +101,19 @@ def parse_remote_url(url: str) -> tuple[str, str]: ------ DataJointError If URL protocol is not supported. + + Examples + -------- + >>> parse_url("s3://bucket/key/file.dat") + ('s3', 'bucket/key/file.dat') + >>> parse_url("file:///data/file.dat") + ('file', '/data/file.dat') """ url_lower = url.lower() # Map URL schemes to fsspec protocols protocol_map = { + "file://": "file", "s3://": "s3", "gs://": "gcs", "gcs://": "gcs", @@ -84,7 +128,7 @@ def parse_remote_url(url: str) -> tuple[str, str]: path = url[len(prefix) :] return protocol, path - raise errors.DataJointError(f"Unsupported remote URL protocol: {url}") + raise errors.DataJointError(f"Unsupported URL protocol: {url}") def generate_token(length: int = 8) -> str: @@ -358,6 +402,53 @@ def _full_path(self, path: str | PurePosixPath) -> str: return str(Path(location) / path) return path + def get_url(self, path: str | PurePosixPath) -> str: + """ + Get the full URL for a path in storage. + + Returns a consistent URL representation for any storage backend, + including file:// URLs for local filesystem. + + Parameters + ---------- + path : str or PurePosixPath + Relative path within the storage location. + + Returns + ------- + str + Full URL (e.g., 's3://bucket/path' or 'file:///data/path'). + + Examples + -------- + >>> backend = StorageBackend({"protocol": "file", "location": "/data"}) + >>> backend.get_url("schema/table/file.dat") + 'file:///data/schema/table/file.dat' + + >>> backend = StorageBackend({"protocol": "s3", "bucket": "mybucket", ...}) + >>> backend.get_url("schema/table/file.dat") + 's3://mybucket/schema/table/file.dat' + """ + full_path = self._full_path(path) + + if self.protocol == "file": + # Ensure absolute path for file:// URL + abs_path = str(Path(full_path).resolve()) + if abs_path.startswith("/"): + return f"file://{abs_path}" + else: + # Windows path + return f"file:///{abs_path.replace(chr(92), '/')}" + elif self.protocol == "s3": + return f"s3://{full_path}" + elif self.protocol == "gcs": + return f"gs://{full_path}" + elif self.protocol == "azure": + return f"az://{full_path}" + else: + # Fallback: use protocol prefix + return f"{self.protocol}://{full_path}" + def put_file(self, local_path: str | Path, remote_path: str | PurePosixPath, metadata: dict | None = None) -> None: """ Upload a file from local filesystem to storage. @@ -674,7 +765,7 @@ def copy_from_url(self, source_url: str, dest_path: str | PurePosixPath) -> int: int Size of copied file in bytes. 
""" - protocol, source_path = parse_remote_url(source_url) + protocol, source_path = parse_url(source_url) full_dest = self._full_path(dest_path) logger.debug(f"copy_from_url: {protocol}://{source_path} -> {self.protocol}:{full_dest}") @@ -774,8 +865,8 @@ def source_is_directory(self, source: str) -> bool: bool True if source is a directory. """ - if is_remote_url(source): - protocol, path = parse_remote_url(source) + if is_url(source): + protocol, path = parse_url(source) source_fs = fsspec.filesystem(protocol) return source_fs.isdir(path) else: @@ -795,8 +886,8 @@ def source_exists(self, source: str) -> bool: bool True if source exists. """ - if is_remote_url(source): - protocol, path = parse_remote_url(source) + if is_url(source): + protocol, path = parse_url(source) source_fs = fsspec.filesystem(protocol) return source_fs.exists(path) else: @@ -817,8 +908,8 @@ def get_source_size(self, source: str) -> int | None: Size in bytes, or None if directory or cannot determine. """ try: - if is_remote_url(source): - protocol, path = parse_remote_url(source) + if is_url(source): + protocol, path = parse_url(source) source_fs = fsspec.filesystem(protocol) if source_fs.isdir(path): return None diff --git a/src/datajoint/types.py b/src/datajoint/types.py index 72cefee3c..c8f6c7039 100644 --- a/src/datajoint/types.py +++ b/src/datajoint/types.py @@ -9,22 +9,16 @@ from __future__ import annotations -from typing import Any, TypeAlias +from typing import TYPE_CHECKING, Any, TypeAlias # Primary key types PrimaryKey: TypeAlias = dict[str, Any] """A dictionary mapping attribute names to values that uniquely identify an entity.""" -PrimaryKeyList: TypeAlias = list[dict[str, Any]] -"""A list of primary key dictionaries.""" - # Row/record types Row: TypeAlias = dict[str, Any] """A single row/record as a dictionary mapping attribute names to values.""" -RowList: TypeAlias = list[dict[str, Any]] -"""A list of rows/records.""" - # Attribute types AttributeName: TypeAlias = str """Name of a table attribute/column.""" @@ -47,7 +41,7 @@ """Mapping of child_attr -> (parent_table, parent_attr) for foreign keys.""" # Restriction types -Restriction: TypeAlias = str | dict[str, Any] | bool | "QueryExpression" | list | None +Restriction: TypeAlias = str | dict[str, Any] | bool | "QueryExpression" | list[Any] | None """Valid restriction types for query operations.""" # Fetch result types @@ -56,5 +50,5 @@ # For avoiding circular imports -if False: # TYPE_CHECKING equivalent that's always False +if TYPE_CHECKING: from .expression import QueryExpression diff --git a/tests/unit/test_storage_urls.py b/tests/unit/test_storage_urls.py new file mode 100644 index 000000000..649d695b2 --- /dev/null +++ b/tests/unit/test_storage_urls.py @@ -0,0 +1,121 @@ +"""Unit tests for storage URL functions.""" + +import pytest + +from datajoint.storage import ( + URL_PROTOCOLS, + is_url, + normalize_to_url, + parse_url, +) + + +class TestURLProtocols: + """Test URL protocol constants.""" + + def test_url_protocols_includes_file(self): + """URL_PROTOCOLS should include file://.""" + assert "file://" in URL_PROTOCOLS + + def test_url_protocols_includes_s3(self): + """URL_PROTOCOLS should include s3://.""" + assert "s3://" in URL_PROTOCOLS + + def test_url_protocols_includes_cloud_providers(self): + """URL_PROTOCOLS should include major cloud providers.""" + assert "gs://" in URL_PROTOCOLS + assert "az://" in URL_PROTOCOLS + + +class TestIsUrl: + """Test is_url function.""" + + def test_s3_url(self): + assert is_url("s3://bucket/key") + + def 
test_gs_url(self): + assert is_url("gs://bucket/key") + + def test_file_url(self): + assert is_url("file:///path/to/file") + + def test_http_url(self): + assert is_url("http://example.com/file") + + def test_https_url(self): + assert is_url("https://example.com/file") + + def test_local_path_not_url(self): + assert not is_url("/path/to/file") + + def test_relative_path_not_url(self): + assert not is_url("relative/path/file.dat") + + def test_case_insensitive(self): + assert is_url("S3://bucket/key") + assert is_url("FILE:///path") + + +class TestNormalizeToUrl: + """Test normalize_to_url function.""" + + def test_local_path_to_file_url(self): + url = normalize_to_url("/data/file.dat") + assert url.startswith("file://") + assert "data/file.dat" in url + + def test_s3_url_unchanged(self): + url = "s3://bucket/key/file.dat" + assert normalize_to_url(url) == url + + def test_file_url_unchanged(self): + url = "file:///data/file.dat" + assert normalize_to_url(url) == url + + def test_relative_path_becomes_absolute(self): + url = normalize_to_url("relative/path.dat") + assert url.startswith("file://") + # Should be absolute (contain full path) + assert "/" in url[7:] # After "file://" + + +class TestParseUrl: + """Test parse_url function.""" + + def test_parse_s3(self): + protocol, path = parse_url("s3://bucket/key/file.dat") + assert protocol == "s3" + assert path == "bucket/key/file.dat" + + def test_parse_gs(self): + protocol, path = parse_url("gs://bucket/key") + assert protocol == "gcs" + assert path == "bucket/key" + + def test_parse_gcs(self): + protocol, path = parse_url("gcs://bucket/key") + assert protocol == "gcs" + assert path == "bucket/key" + + def test_parse_file(self): + protocol, path = parse_url("file:///data/file.dat") + assert protocol == "file" + assert path == "/data/file.dat" + + def test_parse_http(self): + protocol, path = parse_url("http://example.com/file") + assert protocol == "http" + assert path == "example.com/file" + + def test_parse_https(self): + protocol, path = parse_url("https://example.com/file") + assert protocol == "https" + assert path == "example.com/file" + + def test_unsupported_protocol_raises(self): + with pytest.raises(Exception, match="Unsupported URL protocol"): + parse_url("ftp://example.com/file") + + def test_local_path_raises(self): + with pytest.raises(Exception, match="Unsupported URL protocol"): + parse_url("/local/path")
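+
+
+class TestUrlRoundTrip:
+    """Round-trip sketch combining normalize_to_url and parse_url.
+
+    Illustrative checks built from the docstring examples in
+    datajoint/storage.py; they assume a POSIX filesystem, where
+    Path('/data/file.dat').resolve() returns the path unchanged.
+    """
+
+    def test_local_path_round_trip(self):
+        # /data/file.dat -> file:///data/file.dat -> ('file', '/data/file.dat')
+        url = normalize_to_url("/data/file.dat")
+        assert url == "file:///data/file.dat"
+        assert parse_url(url) == ("file", "/data/file.dat")
+
+    def test_remote_url_round_trip(self):
+        # remote URLs pass through normalize_to_url unchanged and parse cleanly
+        url = normalize_to_url("s3://bucket/key/file.dat")
+        assert parse_url(url) == ("s3", "bucket/key/file.dat")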