|
| 1 | +# This code is based on the following gist |
| 2 | +# https://gist.github.com/enricorotundo/1e074af39d90629252a7df3fc1066397 |
| 3 | +# Some comments are taken from the following post in stackExchange |
| 4 | +# https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede |
| 5 | +# We added the content licence for comments, posts and postlinks, |
| 6 | +# Make sure your tables use InnoDB by default; otherwise specify the engine explicitly (needed for the full-text index) |
| 7 | +# If you are having problems loading data into your DataBase you must |
| 8 | +# 1) run mysql with --secure_file_priv="" |
| 9 | +# 2) If you are on a Mac and are having problems running the complementary files, you can start MySQL with mysqld_safe --secure_file_priv="" (located in the bin folder) |
| 10 | +# 3) The path of your xml file has to be an absolute path |
| 11 | + |
| 12 | +# - -------- ----------- ------------ --------------- ---------- |
| 13 | + |
| 14 | + |
| 15 | + |
| 16 | +# ---PART 1---------------------------------- CREATE the Posts TABLE FOR stackoverflow (the only thing that changes with the other tables is the absence of the title-body full-text index) |
| 17 | +# ------------------------------------- It also creates the schema for the auxiliary table (PostsQuestionsFiltered) |
| 18 | + |
| 19 | +# Posts: target table for the Posts.xml load further down. The storage engine is set explicitly instead of relying on the server default, as the notes at the top of this file ask for InnoDB tables. |
| 20 | +CREATE TABLE Posts ( |
| 21 | +    Id INT NOT NULL PRIMARY KEY, |
| 22 | +    PostTypeId TINYINT NOT NULL, |
| 23 | +    # 1 = Question |
| 24 | +    # 2 = Answer |
| 25 | +    # 3 = Orphaned tag wiki |
| 26 | +    # 4 = Tag wiki excerpt |
| 27 | +    # 5 = Tag wiki |
| 28 | +    # 6 = Moderator nomination |
| 29 | +    # 7 = "Wiki placeholder" (seems to only be the election description) |
| 30 | +    # 8 = Privilege wiki |
| 31 | +    AcceptedAnswerId INT, |
| 32 | +    ParentId INT, |
| 33 | +    CreationDate DATETIME NOT NULL, |
| 34 | +    DeletionDate DATETIME, |
| 35 | +    Score INT, |
| 36 | +    ViewCount INT, |
| 37 | +    Body TEXT, |
| 38 | +    OwnerUserId INT, |
| 39 | +    OwnerDisplayName VARCHAR(256), |
| 40 | +    LastEditorUserId INT, |
| 41 | +    LastEditorDisplayName VARCHAR(40), |
| 42 | +    LastEditDate DATETIME, |
| 43 | +    LastActivityDate DATETIME, |
| 44 | +    Title VARCHAR(256), |
| 45 | +    Tags VARCHAR(256), |
| 46 | +    AnswerCount INT DEFAULT 0, |
| 47 | +    CommentCount INT DEFAULT 0, |
| 48 | +    FavoriteCount INT DEFAULT 0, |
| 49 | +    ClosedDate DATETIME, |
| 50 | +    CommunityOwnedDate DATETIME, |
| 51 | +    ContentLicense VARCHAR(20) |
| 52 | +) ENGINE=InnoDB; |
| 53 | +# PostsQuestionsFiltered: auxiliary table with the same column layout as Posts; it is filled in Part 2.1. The storage engine is set explicitly instead of relying on the server default, as the notes at the top of this file ask for InnoDB tables. |
| 54 | +CREATE TABLE PostsQuestionsFiltered ( |
| 55 | +    Id INT NOT NULL PRIMARY KEY, |
| 56 | +    PostTypeId TINYINT NOT NULL, |
| 57 | +    # 1 = Question |
| 58 | +    # 2 = Answer |
| 59 | +    # 3 = Orphaned tag wiki |
| 60 | +    # 4 = Tag wiki excerpt |
| 61 | +    # 5 = Tag wiki |
| 62 | +    # 6 = Moderator nomination |
| 63 | +    # 7 = "Wiki placeholder" (seems to only be the election description) |
| 64 | +    # 8 = Privilege wiki |
| 65 | +    AcceptedAnswerId INT, |
| 66 | +    ParentId INT, |
| 67 | +    CreationDate DATETIME NOT NULL, |
| 68 | +    DeletionDate DATETIME, |
| 69 | +    Score INT, |
| 70 | +    ViewCount INT, |
| 71 | +    Body TEXT, |
| 72 | +    OwnerUserId INT, |
| 73 | +    OwnerDisplayName VARCHAR(256), |
| 74 | +    LastEditorUserId INT, |
| 75 | +    LastEditorDisplayName VARCHAR(40), |
| 76 | +    LastEditDate DATETIME, |
| 77 | +    LastActivityDate DATETIME, |
| 78 | +    Title VARCHAR(256), |
| 79 | +    Tags VARCHAR(256), |
| 80 | +    AnswerCount INT DEFAULT 0, |
| 81 | +    CommentCount INT DEFAULT 0, |
| 82 | +    FavoriteCount INT DEFAULT 0, |
| 83 | +    ClosedDate DATETIME, |
| 84 | +    CommunityOwnedDate DATETIME, |
| 85 | +    ContentLicense VARCHAR(20) |
| 86 | +) ENGINE=InnoDB; |
| 87 | + |
| 88 | +# Show which schema this session is using. |
| 89 | +SELECT DATABASE(); |
| 90 | + |
| 91 | +# Row count of Posts at this point in the script (the XML load comes next). |
| 92 | +SELECT COUNT(*) AS postsCount |
| 93 | +FROM Posts; |
| 94 | + |
| 95 | + |
| 96 | +# Load Posts.xml into Posts, one table row per <row> element. Whether this form is needed depends on your OS and security settings (presumably this refers to the LOCAL keyword); use an absolute path. |
| 97 | +LOAD XML LOCAL INFILE '/[path_to_the_base_folder]/[temp]/Posts.xml' INTO TABLE Posts ROWS IDENTIFIED BY '<row>'; |
| 98 | + |
| 99 | + |
| 100 | +# Diagnostic: list the schemas visible to this connection. |
| 101 | +SHOW DATABASES; |
| 102 | + |
| 103 | +# Secondary indexes on the id columns of Posts (created after the XML load above). |
| 104 | +CREATE INDEX Posts_idx_1 ON Posts (AcceptedAnswerId); |
| 105 | +CREATE INDEX Posts_idx_2 ON Posts (ParentId); |
| 106 | +CREATE INDEX Posts_idx_3 ON Posts (OwnerUserId); |
| 107 | +CREATE INDEX Posts_idx_4 ON Posts (LastEditorUserId); |
| 108 | + |
| 109 | +# Inspect the indexes created so far. |
| 110 | +SHOW INDEX FROM Posts; |
| 111 | + |
| 112 | +# Full-text index on Tags; the MATCH (p.Tags) ... AGAINST filters in Parts 2 and 3 search this column. |
| 113 | +CREATE FULLTEXT INDEX index_Tags ON Posts (Tags); |
| 114 | + |
| 115 | +# Inspect again; index_Tags should now be listed. |
| 116 | +SHOW INDEX FROM Posts; |
| 117 | + |
| 118 | + |
| 119 | + |
| 120 | +# ---PART 2------- (ARTICLE) Query that filters all the posts that are questions (this is for creating a smaller Stack Overflow (STO) table) |
| 121 | + |
| 122 | + |
| 123 | +#----PART 2.1----- Fill PostsQuestionsFiltered: questions (PostTypeId = 1) with Score > 0, an accepted answer, and Tags matching the phrase "machine-learning" |
| 124 | +# Columns are named on both sides so the copy does not depend on the two tables keeping the same column order. |
| 125 | +INSERT INTO PostsQuestionsFiltered (Id, PostTypeId, AcceptedAnswerId, ParentId, CreationDate, DeletionDate, Score, ViewCount, Body, OwnerUserId, OwnerDisplayName, LastEditorUserId, LastEditorDisplayName, LastEditDate, LastActivityDate, Title, Tags, AnswerCount, CommentCount, FavoriteCount, ClosedDate, CommunityOwnedDate, ContentLicense) |
| 126 | +SELECT p.Id, p.PostTypeId, p.AcceptedAnswerId, p.ParentId, p.CreationDate, p.DeletionDate, p.Score, p.ViewCount, p.Body, p.OwnerUserId, p.OwnerDisplayName, p.LastEditorUserId, p.LastEditorDisplayName, p.LastEditDate, p.LastActivityDate, p.Title, p.Tags, p.AnswerCount, p.CommentCount, p.FavoriteCount, p.ClosedDate, p.CommunityOwnedDate, p.ContentLicense |
| 127 | +FROM Posts p |
| 128 | +WHERE p.PostTypeId = 1 AND p.Score > 0 AND p.AcceptedAnswerId IS NOT NULL |
| 129 | +    AND MATCH (p.Tags) AGAINST ('"machine-learning"' IN BOOLEAN MODE); |
| 130 | + |
| 131 | + |
| 132 | +# --- For INSERT performance, these indexes are created only after PostsQuestionsFiltered has been loaded. |
| 133 | + |
| 134 | +# Secondary indexes on the id columns. |
| 135 | +CREATE INDEX Posts_idx_1 ON PostsQuestionsFiltered (AcceptedAnswerId); |
| 136 | +CREATE INDEX Posts_idx_2 ON PostsQuestionsFiltered (ParentId); |
| 137 | +CREATE INDEX Posts_idx_3 ON PostsQuestionsFiltered (OwnerUserId); |
| 138 | +CREATE INDEX Posts_idx_4 ON PostsQuestionsFiltered (LastEditorUserId); |
| 139 | + |
| 140 | +# Inspect the indexes created so far. |
| 141 | +SHOW INDEX FROM PostsQuestionsFiltered; |
| 142 | + |
| 143 | +# Full-text index on Tags. |
| 144 | +CREATE FULLTEXT INDEX index_Tags ON PostsQuestionsFiltered (Tags); |
| 145 | + |
| 146 | +# Inspect again; index_Tags should now be listed. |
| 147 | +SHOW INDEX FROM PostsQuestionsFiltered; |
| 148 | + |
| 149 | +# Full-text index covering Title and Body together. |
| 150 | +CREATE FULLTEXT INDEX index_Text_title ON PostsQuestionsFiltered (Title, Body); |
| 151 | + |
| 152 | +# Final check of the index list. |
| 153 | +SHOW INDEX FROM PostsQuestionsFiltered; |
| 154 | + |
| 155 | + |
| 156 | + |
| 157 | + |
| 158 | + |
| 159 | +# ------PART 3---------------------------- Query that filters all the posts that are questions (this is for creating a smaller Stack Overflow (STO) table), not filtering by score |
| 160 | +# ------------------- We assume the Posts table has already been created with the indexes from PART 1 above |
| 161 | +# PostsQuestionsFilteredNoScore: same column layout as Posts; filled by the INSERT that follows. The storage engine is set explicitly instead of relying on the server default, as the notes at the top of this file ask for InnoDB tables. |
| 162 | +CREATE TABLE PostsQuestionsFilteredNoScore ( |
| 163 | +    Id INT NOT NULL PRIMARY KEY, |
| 164 | +    PostTypeId TINYINT NOT NULL, |
| 165 | +    # 1 = Question |
| 166 | +    # 2 = Answer |
| 167 | +    # 3 = Orphaned tag wiki |
| 168 | +    # 4 = Tag wiki excerpt |
| 169 | +    # 5 = Tag wiki |
| 170 | +    # 6 = Moderator nomination |
| 171 | +    # 7 = "Wiki placeholder" (seems to only be the election description) |
| 172 | +    # 8 = Privilege wiki |
| 173 | +    AcceptedAnswerId INT, |
| 174 | +    ParentId INT, |
| 175 | +    CreationDate DATETIME NOT NULL, |
| 176 | +    DeletionDate DATETIME, |
| 177 | +    Score INT, |
| 178 | +    ViewCount INT, |
| 179 | +    Body TEXT, |
| 180 | +    OwnerUserId INT, |
| 181 | +    OwnerDisplayName VARCHAR(256), |
| 182 | +    LastEditorUserId INT, |
| 183 | +    LastEditorDisplayName VARCHAR(40), |
| 184 | +    LastEditDate DATETIME, |
| 185 | +    LastActivityDate DATETIME, |
| 186 | +    Title VARCHAR(256), |
| 187 | +    Tags VARCHAR(256), |
| 188 | +    AnswerCount INT DEFAULT 0, |
| 189 | +    CommentCount INT DEFAULT 0, |
| 190 | +    FavoriteCount INT DEFAULT 0, |
| 191 | +    ClosedDate DATETIME, |
| 192 | +    CommunityOwnedDate DATETIME, |
| 193 | +    ContentLicense VARCHAR(20) |
| 194 | +) ENGINE=InnoDB; |
| 195 | + |
| 196 | +# Fill PostsQuestionsFilteredNoScore: questions (PostTypeId = 1) with an accepted answer and Tags matching the phrase "machine-learning"; no Score condition is applied. |
| 197 | +# Columns are named on both sides so the copy does not depend on the two tables keeping the same column order. |
| 198 | +INSERT INTO PostsQuestionsFilteredNoScore (Id, PostTypeId, AcceptedAnswerId, ParentId, CreationDate, DeletionDate, Score, ViewCount, Body, OwnerUserId, OwnerDisplayName, LastEditorUserId, LastEditorDisplayName, LastEditDate, LastActivityDate, Title, Tags, AnswerCount, CommentCount, FavoriteCount, ClosedDate, CommunityOwnedDate, ContentLicense) |
| 199 | +SELECT p.Id, p.PostTypeId, p.AcceptedAnswerId, p.ParentId, p.CreationDate, p.DeletionDate, p.Score, p.ViewCount, p.Body, p.OwnerUserId, p.OwnerDisplayName, p.LastEditorUserId, p.LastEditorDisplayName, p.LastEditDate, p.LastActivityDate, p.Title, p.Tags, p.AnswerCount, p.CommentCount, p.FavoriteCount, p.ClosedDate, p.CommunityOwnedDate, p.ContentLicense |
| 200 | +FROM Posts p |
| 201 | +WHERE p.PostTypeId = 1 AND p.AcceptedAnswerId IS NOT NULL |
| 202 | +    AND MATCH (p.Tags) AGAINST ('"machine-learning"' IN BOOLEAN MODE); |
| 203 | + |
| 204 | + |
| 205 | + |
| 206 | + |
| 207 | + |
| 208 | + |
| 209 | + |
| 210 | + |
| 211 | + |
| 212 | + |
| 213 | + |
| 214 | + |
| 215 | + |
| 216 | + |
0 commit comments