Skip to content
Draft
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
06057fc
derm treatment basic1
john-sanchez31 Aug 19, 2025
1fd4be7
derm treatment basic questions
john-sanchez31 Aug 20, 2025
170246b
defog dermtreatments adv questions
john-sanchez31 Aug 21, 2025
4564b1a
Merge branch 'main' into John/defogdbs
john-sanchez31 Aug 21, 2025
4270099
dermtreatment adv added
john-sanchez31 Aug 21, 2025
cedb935
defog dermtreatment adv questions and mysql defog tests
john-sanchez31 Aug 22, 2025
ba497ba
dermtreatments gen questions
john-sanchez31 Aug 22, 2025
c11a1ce
dermtreatments gen sql files
john-sanchez31 Aug 22, 2025
ffe50a4
defog academic db init
john-sanchez31 Aug 25, 2025
d4a592a
Merge branch 'main' into John/acaddb
john-sanchez31 Aug 27, 2025
b6eb2d4
conflicts with main solved
john-sanchez31 Sep 26, 2025
68aab6c
init data postgres and sf, metadata added
john-sanchez31 Sep 30, 2025
3057d52
adding pydough functions, fixing metadata
john-sanchez31 Sep 30, 2025
92ad7d3
WIP: academic metadata naming and descriptions
john-sanchez31 Oct 1, 2025
68228da
academic metadata
john-sanchez31 Oct 2, 2025
8fd10b1
metadata fixed and WIP gen1 pydough
john-sanchez31 Oct 2, 2025
690b662
WIP 12 gen questions
john-sanchez31 Oct 3, 2025
ddc3cfd
Merge branch 'main' into John/acaddb
john-sanchez31 Oct 3, 2025
4078f93
gen test 13-20
john-sanchez31 Oct 6, 2025
70bb6f4
Merge branch 'main' into John/acaddb
john-sanchez31 Oct 6, 2025
831c1bc
mysql and sf e2e test added
john-sanchez31 Oct 7, 2025
ec9106d
minor fixes
john-sanchez31 Oct 8, 2025
28e4857
Merge branch 'main' into John/acaddb [run all]
john-sanchez31 Oct 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 148 additions & 0 deletions tests/gen_data/init_defog_mysql.sql
Original file line number Diff line number Diff line change
Expand Up @@ -906,3 +906,151 @@ VALUES
(13, 12, 'Vitamin D', '2022-12-01', NULL, 1000, 'IU', 24),
(14, 13, 'Acetaminophen', '2023-01-08', '2023-01-14', 500, 'mg', 6),
(15, 14, 'Hydrocortisone cream', '2023-02-25', '2023-03-07', 10, 'g', 12);


-- Rebuild the `academic` database from scratch: drop any existing copy,
-- create an empty one, then select it so the unqualified table names in the
-- statements that follow resolve inside it.
DROP DATABASE IF EXISTS `academic`;
CREATE DATABASE `academic`;
USE `academic`;

-- Authors, keyed by `aid`. `homepage`, `name` and `oid` are nullable.
CREATE TABLE `author` (
    `aid`      BIGINT NOT NULL PRIMARY KEY,
    `homepage` TEXT,
    `name`     TEXT,
    `oid`      BIGINT
);

-- (`cited`, `citing`) pairs. Both columns are nullable; no key is declared.
CREATE TABLE `cite` (
    `cited`  BIGINT,
    `citing` BIGINT
);

-- Conferences, keyed by `cid`. `homepage` and `name` are nullable.
CREATE TABLE `conference` (
    `cid`      BIGINT NOT NULL PRIMARY KEY,
    `homepage` TEXT,
    `name`     TEXT
);

-- Domains, keyed by `did`. `name` is nullable.
CREATE TABLE `domain` (
    `did`  BIGINT NOT NULL PRIMARY KEY,
    `name` TEXT
);

-- (`aid`, `did`) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
CREATE TABLE `domain_author` (
    `aid` BIGINT NOT NULL,
    `did` BIGINT NOT NULL
);

-- (`cid`, `did`) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
CREATE TABLE `domain_conference` (
    `cid` BIGINT NOT NULL,
    `did` BIGINT NOT NULL
);

-- (`did`, `jid`) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
CREATE TABLE `domain_journal` (
    `did` BIGINT NOT NULL,
    `jid` BIGINT NOT NULL
);

-- (`did`, `kid`) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
CREATE TABLE `domain_keyword` (
    `did` BIGINT NOT NULL,
    `kid` BIGINT NOT NULL
);

-- (`did`, `pid`) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
CREATE TABLE `domain_publication` (
    `did` BIGINT NOT NULL,
    `pid` BIGINT NOT NULL
);

-- Journals, keyed by `jid`. `homepage` and `name` are nullable.
-- Column order is kept as (homepage, jid, name).
CREATE TABLE `journal` (
    `homepage` TEXT,
    `jid`      BIGINT NOT NULL PRIMARY KEY,
    `name`     TEXT
);

-- Keywords, keyed by `kid`. The `keyword` text itself is nullable.
CREATE TABLE `keyword` (
    `keyword` TEXT,
    `kid`     BIGINT NOT NULL PRIMARY KEY
);

-- Organizations, keyed by `oid`. `continent`, `homepage` and `name` are
-- nullable.
CREATE TABLE `organization` (
    `continent` TEXT,
    `homepage`  TEXT,
    `name`      TEXT,
    `oid`       BIGINT NOT NULL PRIMARY KEY
);

-- Publications, keyed by `pid`. Every other column is nullable, including
-- `cid` and `jid`; no foreign keys are declared.
CREATE TABLE `publication` (
    `abstract`      TEXT,
    `cid`           BIGINT,
    `citation_num`  BIGINT,
    `jid`           BIGINT,
    `pid`           BIGINT NOT NULL PRIMARY KEY,
    `reference_num` BIGINT,
    `title`         TEXT,
    `year`          BIGINT
);

-- (`pid`, `kid`) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
CREATE TABLE `publication_keyword` (
    `pid` BIGINT NOT NULL,
    `kid` BIGINT NOT NULL
);

-- (`aid`, `pid`) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
CREATE TABLE `writes` (
    `aid` BIGINT NOT NULL,
    `pid` BIGINT NOT NULL
);

-- Seed rows for the academic tables, one row per line.

-- Five authors; author 5 has neither a homepage nor an organization id.
INSERT INTO `author` (`aid`, `homepage`, `name`, `oid`)
VALUES
    (1, 'www.larry.com', 'Larry Summers', 2),
    (2, 'www.ashish.com', 'Ashish Vaswani', 3),
    (3, 'www.noam.com', 'Noam Shazeer', 3),
    (4, 'www.martin.com', 'Martin Odersky', 4),
    (5, NULL, 'Kempinski', NULL);

-- Nine (cited, citing) pairs.
INSERT INTO `cite` (`cited`, `citing`)
VALUES
    (1, 2),
    (1, 3),
    (1, 4),
    (1, 5),
    (2, 3),
    (2, 5),
    (3, 4),
    (3, 5),
    (4, 5);

INSERT INTO `conference` (`cid`, `homepage`, `name`)
VALUES
    (1, 'www.isa.com', 'ISA'),
    (2, 'www.aaas.com', 'AAAS'),
    (3, 'www.icml.com', 'ICML');

INSERT INTO `domain` (`did`, `name`)
VALUES
    (1, 'Data Science'),
    (2, 'Natural Sciences'),
    (3, 'Computer Science'),
    (4, 'Sociology'),
    (5, 'Machine Learning');

INSERT INTO `domain_author` (`aid`, `did`)
VALUES
    (1, 2),
    (1, 4),
    (2, 3),
    (2, 1),
    (2, 5),
    (3, 5),
    (3, 3),
    (4, 3);

INSERT INTO `domain_conference` (`cid`, `did`)
VALUES
    (1, 2),
    (2, 4),
    (3, 5);

INSERT INTO `domain_journal` (`did`, `jid`)
VALUES
    (1, 2),
    (2, 3),
    (5, 4);

INSERT INTO `domain_keyword` (`did`, `kid`)
VALUES
    (1, 2),
    (2, 3);

INSERT INTO `domain_publication` (`did`, `pid`)
VALUES
    (4, 1),
    (2, 2),
    (1, 3),
    (3, 4),
    (3, 5),
    (5, 5);

INSERT INTO `journal` (`homepage`, `jid`, `name`)
VALUES
    ('www.aijournal.com', 1, 'Journal of Artificial Intelligence Research'),
    ('www.nature.com', 2, 'Nature'),
    ('www.science.com', 3, 'Science'),
    ('www.ml.com', 4, 'Journal of Machine Learning Research');

INSERT INTO `keyword` (`keyword`, `kid`)
VALUES
    ('AI', 1),
    ('Neuroscience', 2),
    ('Machine Learning', 3),
    ('Keyword 4', 4);

-- Organization 4's name contains non-ASCII characters.
INSERT INTO `organization` (`continent`, `homepage`, `name`, `oid`)
VALUES
    ('Asia', 'www.organization1.com', 'Organization 1', 1),
    ('North America', 'www.organization2.com', 'Organization 2', 2),
    ('North America', 'www.organization3.com', 'Organization 3', 3),
    ('Europe', 'www.epfl.com', 'École Polytechnique Fédérale de Lausanne 4', 4),
    ('Europe', 'www.organization5.com', 'Organization 5', 5);

INSERT INTO `publication` (`abstract`, `cid`, `citation_num`, `jid`, `pid`, `reference_num`, `title`, `year`)
VALUES
    ('Abstract 1', 1, 4, 1, 1, 0, 'The Effects of Climate Change on Agriculture', 2020),
    ('Abstract 2', 2, 2, 2, 2, 1, 'A Study on the Effects of Social Media on Mental Health', 2020),
    ('Abstract 3', 3, 2, 2, 3, 2, 'Data Mining Techniques', 2021),
    ('Abstract 4', 3, 1, 2, 4, 2, 'Optimizing GPU Throughput', 2021),
    ('Abstract 5', 3, 0, 4, 5, 4, 'Attention is all you need', 2021);

INSERT INTO `publication_keyword` (`pid`, `kid`)
VALUES
    (1, 2),
    (2, 3);

INSERT INTO `writes` (`aid`, `pid`)
VALUES
    (1, 1),
    (1, 2),
    (2, 3),
    (2, 4),
    (2, 5),
    (3, 5);
229 changes: 229 additions & 0 deletions tests/gen_data/init_defog_postgres.sql
Original file line number Diff line number Diff line change
Expand Up @@ -1058,3 +1058,232 @@ VALUES
(13, 12, 'Vitamin D', '2022-12-01', NULL, 1000, 'IU', 24),
(14, 13, 'Acetaminophen', '2023-01-08', '2023-01-14', 500, 'mg', 6),
(15, 14, 'Hydrocortisone cream', '2023-02-25', '2023-03-07', 10, 'g', 12);

-- ACADEMIC
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My concern with this format (all defog tables in the same schema: main) is that we could have name conflicts if any table has the same name. These tables should be in a different schema ACADEMIC, the same administrative order we use in snowflake. The same can be done for SQLite using ATTACH. Probably also for MySQL using schemas.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If that is the case, we don't really need 4 different metadata files if the types used are compatible with the metadata type.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is copying how things are done for the real defog benchmark. We can change things potentially, but for now let's keep it consistent.

-- Authors, keyed by aid. homepage, name and oid are nullable.
-- The PRIMARY KEY mirrors the one declared for this table in
-- init_defog_mysql.sql, so both dialects reject duplicate author ids.
DROP TABLE IF EXISTS main.author CASCADE;
CREATE TABLE main.author (
    aid bigint NOT NULL,
    homepage text,
    name text,
    oid bigint,
    PRIMARY KEY (aid)
);

-- (cited, citing) pairs. Both columns are nullable; no key is declared.
DROP TABLE IF EXISTS main.cite CASCADE;
CREATE TABLE main.cite (
    cited  bigint,
    citing bigint
);

-- Conferences, keyed by cid. homepage and name are nullable.
-- The PRIMARY KEY mirrors the one declared for this table in
-- init_defog_mysql.sql, so both dialects reject duplicate conference ids.
DROP TABLE IF EXISTS main.conference CASCADE;
CREATE TABLE main.conference (
    cid bigint NOT NULL,
    homepage text,
    name text,
    PRIMARY KEY (cid)
);

-- Domains, keyed by did. name is nullable.
-- The PRIMARY KEY mirrors the one declared for this table in
-- init_defog_mysql.sql, so both dialects reject duplicate domain ids.
DROP TABLE IF EXISTS main.domain CASCADE;
CREATE TABLE main.domain (
    did bigint NOT NULL,
    name text,
    PRIMARY KEY (did)
);


-- (aid, did) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
DROP TABLE IF EXISTS main.domain_author CASCADE;
CREATE TABLE main.domain_author (
    aid bigint NOT NULL,
    did bigint NOT NULL
);


-- (cid, did) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
DROP TABLE IF EXISTS main.domain_conference CASCADE;
CREATE TABLE main.domain_conference (
    cid bigint NOT NULL,
    did bigint NOT NULL
);


-- (did, jid) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
DROP TABLE IF EXISTS main.domain_journal CASCADE;
CREATE TABLE main.domain_journal (
    did bigint NOT NULL,
    jid bigint NOT NULL
);


-- (did, kid) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
DROP TABLE IF EXISTS main.domain_keyword CASCADE;
CREATE TABLE main.domain_keyword (
    did bigint NOT NULL,
    kid bigint NOT NULL
);


-- (did, pid) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
DROP TABLE IF EXISTS main.domain_publication CASCADE;
CREATE TABLE main.domain_publication (
    did bigint NOT NULL,
    pid bigint NOT NULL
);


-- Journals, keyed by jid. homepage and name are nullable.
-- Column order is kept as (homepage, jid, name).
-- The PRIMARY KEY mirrors the one declared for this table in
-- init_defog_mysql.sql, so both dialects reject duplicate journal ids.
DROP TABLE IF EXISTS main.journal CASCADE;
CREATE TABLE main.journal (
    homepage text,
    jid bigint NOT NULL,
    name text,
    PRIMARY KEY (jid)
);


-- Keywords, keyed by kid. The keyword text itself is nullable.
-- The PRIMARY KEY mirrors the one declared for this table in
-- init_defog_mysql.sql, so both dialects reject duplicate keyword ids.
DROP TABLE IF EXISTS main.keyword CASCADE;
CREATE TABLE main.keyword (
    keyword text,
    kid bigint NOT NULL,
    PRIMARY KEY (kid)
);


-- Organizations, keyed by oid. continent, homepage and name are nullable.
-- The PRIMARY KEY mirrors the one declared for this table in
-- init_defog_mysql.sql, so both dialects reject duplicate organization ids.
DROP TABLE IF EXISTS main.organization CASCADE;
CREATE TABLE main.organization (
    continent text,
    homepage text,
    name text,
    oid bigint NOT NULL,
    PRIMARY KEY (oid)
);


-- Publications, keyed by pid. Every other column is nullable, including
-- cid and jid; no foreign keys are declared.
-- The PRIMARY KEY mirrors the one declared for this table in
-- init_defog_mysql.sql, so both dialects reject duplicate publication ids.
DROP TABLE IF EXISTS main.publication CASCADE;
CREATE TABLE main.publication (
    abstract text,
    cid bigint,
    citation_num bigint,
    jid bigint,
    pid bigint NOT NULL,
    reference_num bigint,
    title text,
    year bigint,
    PRIMARY KEY (pid)
);


-- (pid, kid) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
DROP TABLE IF EXISTS main.publication_keyword CASCADE;
CREATE TABLE main.publication_keyword (
    pid bigint NOT NULL,
    kid bigint NOT NULL
);


-- (aid, pid) pairs. Both columns are required; no key or uniqueness
-- constraint is declared.
DROP TABLE IF EXISTS main.writes CASCADE;
CREATE TABLE main.writes (
    aid bigint NOT NULL,
    pid bigint NOT NULL
);


-- Seed rows for the academic tables in schema main.

-- Five authors; author 5 has neither a homepage nor an organization id.
INSERT INTO main.author (aid, homepage, name, oid)
VALUES
    (1, 'www.larry.com', 'Larry Summers', 2),
    (2, 'www.ashish.com', 'Ashish Vaswani', 3),
    (3, 'www.noam.com', 'Noam Shazeer', 3),
    (4, 'www.martin.com', 'Martin Odersky', 4),
    (5, NULL, 'Kempinski', NULL);

-- Nine (cited, citing) pairs.
INSERT INTO main.cite (cited, citing)
VALUES
    (1, 2),
    (1, 3),
    (1, 4),
    (1, 5),
    (2, 3),
    (2, 5),
    (3, 4),
    (3, 5),
    (4, 5);

INSERT INTO main.conference (cid, homepage, name)
VALUES
    (1, 'www.isa.com', 'ISA'),
    (2, 'www.aaas.com', 'AAAS'),
    (3, 'www.icml.com', 'ICML');

INSERT INTO main.domain (did, name)
VALUES
    (1, 'Data Science'),
    (2, 'Natural Sciences'),
    (3, 'Computer Science'),
    (4, 'Sociology'),
    (5, 'Machine Learning');

INSERT INTO main.domain_author (aid, did)
VALUES
    (1, 2),
    (1, 4),
    (2, 3),
    (2, 1),
    (2, 5),
    (3, 5),
    (3, 3),
    (4, 3);

INSERT INTO main.domain_conference (cid, did)
VALUES
    (1, 2),
    (2, 4),
    (3, 5);

INSERT INTO main.domain_journal (did, jid)
VALUES
    (1, 2),
    (2, 3),
    (5, 4);

INSERT INTO main.domain_keyword (did, kid)
VALUES
    (1, 2),
    (2, 3);

INSERT INTO main.domain_publication (did, pid)
VALUES
    (4, 1),
    (2, 2),
    (1, 3),
    (3, 4),
    (3, 5),
    (5, 5);

INSERT INTO main.journal (homepage, jid, name)
VALUES
    ('www.aijournal.com', 1, 'Journal of Artificial Intelligence Research'),
    ('www.nature.com', 2, 'Nature'),
    ('www.science.com', 3, 'Science'),
    ('www.ml.com', 4, 'Journal of Machine Learning Research');

INSERT INTO main.keyword (keyword, kid)
VALUES
    ('AI', 1),
    ('Neuroscience', 2),
    ('Machine Learning', 3),
    ('Keyword 4', 4);

-- Organization 4's name contains non-ASCII characters.
INSERT INTO main.organization (continent, homepage, name, oid)
VALUES
    ('Asia', 'www.organization1.com', 'Organization 1', 1),
    ('North America', 'www.organization2.com', 'Organization 2', 2),
    ('North America', 'www.organization3.com', 'Organization 3', 3),
    ('Europe', 'www.epfl.com', 'École Polytechnique Fédérale de Lausanne 4', 4),
    ('Europe', 'www.organization5.com', 'Organization 5', 5);

INSERT INTO main.publication (abstract, cid, citation_num, jid, pid, reference_num, title, year)
VALUES
    ('Abstract 1', 1, 4, 1, 1, 0, 'The Effects of Climate Change on Agriculture', 2020),
    ('Abstract 2', 2, 2, 2, 2, 1, 'A Study on the Effects of Social Media on Mental Health', 2020),
    ('Abstract 3', 3, 2, 2, 3, 2, 'Data Mining Techniques', 2021),
    ('Abstract 4', 3, 1, 2, 4, 2, 'Optimizing GPU Throughput', 2021),
    ('Abstract 5', 3, 0, 4, 5, 4, 'Attention is all you need', 2021);

INSERT INTO main.publication_keyword (pid, kid)
VALUES
    (1, 2),
    (2, 3);

INSERT INTO main.writes (aid, pid)
VALUES
    (1, 1),
    (1, 2),
    (2, 3),
    (2, 4),
    (2, 5),
    (3, 5);
Loading