Skip to content

Commit a76483b

Browse files
committed
cleanup of bagit structure
1 parent db2a6ab commit a76483b

File tree

1 file changed

+75
-125
lines changed

1 file changed

+75
-125
lines changed

app/api/Datasets.scala

Lines changed: 75 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -2081,8 +2081,7 @@ class Datasets @Inject()(
20812081
val filenameMap = scala.collection.mutable.Map.empty[UUID, String]
20822082
val inputFiles = scala.collection.mutable.ListBuffer.empty[models.File]
20832083

2084-
// compute list of all files and folder in dataset. This will also make sure
2085-
// that all files and folder names are unique.
2084+
// Get list of all files and folders in dataset and enforce unique names
20862085
fileIDs match {
20872086
case Some(fids) => {
20882087
Logger.info("Downloading only some files")
@@ -2100,179 +2099,118 @@ class Datasets @Inject()(
21002099
val md5Files = scala.collection.mutable.HashMap.empty[String, MessageDigest] //for the files
21012100
val md5Bag = scala.collection.mutable.HashMap.empty[String, MessageDigest] //for the bag files
21022101

2103-
// which file we are currently processing
2104-
21052102
val byteArrayOutputStream = new ByteArrayOutputStream(chunkSize)
21062103
val zip = new ZipOutputStream(byteArrayOutputStream)
2107-
// zip compression level
21082104
zip.setLevel(compression)
21092105

2106+
// Prep enumeration handlers
21102107
var totalBytes = 0L
2111-
var level = 0 //dataset,file, bag
2112-
var file_type = 0 //
2113-
var count = 0 //count for files
2114-
2115-
/*
2116-
* Explanation for the cases
2117-
*
2118-
* the level can be:
2119-
* 0 (file)
2120-
* 1 (dataset)
2121-
* 2 (bag)
2122-
*
2123-
* when the level is file, the file_type can be:
2124-
* 0 (info)
2125-
* 1 (metadata)
2126-
* 2 (the actual files)
2127-
*
2128-
* when the level is dataset, the file_type can be:
2129-
* 0 (info)
2130-
* 1 (metadata)
2131-
*
2132-
* when the level is bag, the file_type can be:
2133-
* 0 - bagit.txt
2134-
* 1 - bag-info.txt
2135-
* 2 - manifest-md5.txt
2136-
* 3 - tagmanifest-md5.txt
2137-
*
2138-
* when the dataset is finished (in either mode) the level = -1 and file_type = -1 and
2139-
* the enumerator is finished
2140-
*/
2108+
var level = "dataset"
2109+
var file_type = "metadata"
2110+
var file_index = 0 //count for files
21412111

2142-
var is: Option[InputStream] = addDatasetInfoToZip(dataFolder, dataset, zip)
2143-
//digest input stream
2112+
// Begin input stream with dataset info file
2113+
var is = addDatasetInfoToZip(dataFolder, dataset, zip)
21442114
val md5 = MessageDigest.getInstance("MD5")
2145-
md5Files.put(dataFolder+"_info.json",md5)
2146-
is = Some(new DigestInputStream(is.get,md5))
2147-
file_type = 1 //next is metadata
2115+
md5Files.put(dataFolder+"_info.json", md5)
2116+
is = Some(new DigestInputStream(is.get, md5))
21482117

2118+
// Handle rest of dataset structure by individual file
21492119
Enumerator.generateM({
21502120
is match {
21512121
case Some(inputStream) => {
21522122
val buffer = new Array[Byte](chunkSize)
21532123
val bytesRead = scala.concurrent.blocking {
21542124
inputStream.read(buffer)
2155-
21562125
}
21572126
val chunk = bytesRead match {
21582127
case -1 => {
21592128
// finished individual file
21602129
zip.closeEntry()
21612130
inputStream.close()
21622131

2163-
(level,file_type) match {
2164-
//dataset, info
2165-
case (0,0) => {
2166-
is = addDatasetInfoToZip(dataFolder,dataset,zip)
2167-
val md5 = MessageDigest.getInstance("MD5")
2168-
md5Files.put("_info.json",md5)
2169-
is = Some(new DigestInputStream(is.get, md5))
2170-
file_type = file_type + 1
2132+
(level, file_type) match {
2133+
case ("dataset", "metadata") => {
2134+
is = addDatasetMetadataToZip(dataFolder, dataset, zip)
2135+
is = addMD5Entry("_metadata.json", is, md5Files)
2136+
val (level, file_type) = ("file", "info")
21712137
}
2172-
//dataset, metadata
2173-
case (0,1) => {
2174-
is = addDatasetMetadataToZip(dataFolder,dataset,zip)
2175-
val md5 = MessageDigest.getInstance("MD5")
2176-
md5Files.put("_metadata.json",md5)
2177-
is = Some(new DigestInputStream(is.get, md5))
2178-
level = 1
2179-
file_type = 0
2180-
}
2181-
//file info
2182-
case (1,0) =>{
2183-
is = addFileInfoToZip(filenameMap(inputFiles(count).id), inputFiles(count), zip)
2184-
val md5 = MessageDigest.getInstance("MD5")
2185-
md5Files.put(filenameMap(inputFiles(count).id)+"_info.json",md5)
2186-
is = Some(new DigestInputStream(is.get, md5))
2187-
if (count+1 < inputFiles.size ){
2188-
count +=1
2189-
} else {
2190-
count = 0
2191-
file_type = 1
2138+
case ("file", "info") => {
2139+
val filename = filenameMap(inputFiles(file_index).id)
2140+
is = addFileInfoToZip(filename, inputFiles(file_index), zip)
2141+
is = addMD5Entry(filename+"_info.json", is, md5Files)
2142+
file_index += 1
2143+
if (file_index >= inputFiles.size) {
2144+
file_index = 0
2145+
file_type = "metadata"
21922146
}
21932147
}
2194-
//file metadata
2195-
case (1,1) =>{
2196-
is = addFileMetadataToZip(filenameMap(inputFiles(count).id), inputFiles(count), zip)
2197-
val md5 = MessageDigest.getInstance("MD5")
2198-
md5Files.put(filenameMap(inputFiles(count).id)+"_metadata.json",md5)
2199-
is = Some(new DigestInputStream(is.get, md5))
2200-
if (count+1 < inputFiles.size ){
2201-
count +=1
2202-
} else {
2203-
count = 0
2204-
file_type = 2
2148+
case ("file", "metadata") => {
2149+
val filename = filenameMap(inputFiles(file_index).id)
2150+
is = addFileMetadataToZip(filename, inputFiles(file_index), zip)
2151+
is = addMD5Entry(filename+"_metadata.json", is, md5Files)
2152+
file_index += 1
2153+
if (file_index >= inputFiles.size){
2154+
file_index = 0
2155+
file_type = "bytes"
22052156
}
22062157
}
2207-
//files
2208-
case (1,2) => {
2209-
is = addFileToZip(filenameMap(inputFiles(count).id), inputFiles(count), zip)
2210-
val md5 = MessageDigest.getInstance("MD5")
2211-
md5Files.put(filenameMap(inputFiles(count).id),md5)
2212-
is = Some(new DigestInputStream(is.get, md5))
2213-
if (count+1 < inputFiles.size ){
2214-
count +=1
2215-
} else {
2216-
if (bagit){
2217-
count = 0
2218-
level = 2
2219-
file_type = 0
2158+
case ("file", "bytes") => {
2159+
val filename = filenameMap(inputFiles(file_index).id)
2160+
is = addFileToZip(filename, inputFiles(file_index), zip)
2161+
is = addMD5Entry(filename, is, md5Files)
2162+
file_index +=1
2163+
if (file_index >= inputFiles.size) {
2164+
if (bagit) {
2165+
file_index = 0
2166+
val (level, file_type) = ("bag", "bagit.txt")
22202167
} else {
2221-
//done
2222-
level = -1
2223-
file_type = -1
2168+
val (level, file_type) = ("done", "none")
22242169
}
2225-
22262170
}
22272171
}
2228-
//bagit.txt
2229-
case (2,0) => {
2172+
case ("bag", "bagit.txt") => {
22302173
is = addBagItTextToZip(totalBytes,filenameMap.size,zip,dataset,user)
2231-
val md5 = MessageDigest.getInstance("MD5")
2232-
md5Bag.put("bagit.txt",md5)
2233-
is = Some(new DigestInputStream(is.get, md5))
2234-
file_type = 1
2174+
is = addMD5Entry("bagit.txt", is, md5Files)
2175+
file_type = "bag-info.txt"
22352176
}
2236-
//bag-info.txt
2237-
case (2,1) => {
2177+
case ("bag", "bag-info.txt") => {
22382178
is = addBagInfoToZip(zip)
2239-
val md5 = MessageDigest.getInstance("MD5")
2240-
md5Bag.put("bag-info.txt",md5)
2241-
is = Some(new DigestInputStream(is.get, md5))
2242-
file_type = 2
2179+
is = addMD5Entry("bag-info.txt", is, md5Files)
2180+
file_type = "manifest-md5.txt"
22432181
}
2244-
//manifest-md5.txt
2245-
case (2,2) => {
2182+
case ("bag", "manifest-md5.txt") => {
22462183
is = addManifestMD5ToZip(md5Files.toMap[String,MessageDigest],zip)
2247-
val md5 = MessageDigest.getInstance("MD5")
2248-
md5Bag.put("manifest-md5.txt",md5)
2249-
is = Some(new DigestInputStream(is.get, md5))
2250-
file_type = 3
2184+
is = addMD5Entry("manifest-md5.txt", is, md5Files)
2185+
file_type = "datacite.xml"
2186+
}
2187+
case ("bag", "datacite.xml") => {
2188+
is = addBagitMetadataToZip(zip)
2189+
file_type = "tagmanifest-md5.txt"
22512190
}
2252-
//tagmanifest-md5.txt
2253-
case (2,3) => {
2191+
case ("bag", "tagmanifest-md5.txt") => {
22542192
is = addTagManifestMD5ToZip(md5Bag.toMap[String,MessageDigest],zip)
2255-
val md5 = MessageDigest.getInstance("MD5")
2256-
md5Bag.put("tagmanifest-md5.txt",md5)
2257-
is = Some(new DigestInputStream(is.get, md5))
2258-
level = -1
2259-
file_type = -1
2193+
is = addMD5Entry("tagmanifest-md5.txt", is, md5Files)
2194+
val (level, file_type) = ("done", "none")
2195+
}
2196+
case ("done", "done") => {
2197+
zip.close()
2198+
is = None
22602199
}
2261-
//the end, or a bad case
22622200
case (_,_) => {
2201+
Logger.error("Unexpected values in dataset zip enum. Closing out anyway.")
22632202
zip.close()
22642203
is = None
22652204
}
22662205
}
2267-
//this is generated after all the matches
22682206
Some(byteArrayOutputStream.toByteArray)
22692207
}
22702208
case read => {
22712209
zip.write(buffer, 0, read)
22722210
Some(byteArrayOutputStream.toByteArray)
22732211
}
22742212
}
2275-
if (level < 2){
2213+
if (level == "file" || level == "dataset"){
22762214
totalBytes += bytesRead
22772215
}
22782216
// reset temporary byte array
@@ -2286,6 +2224,12 @@ class Datasets @Inject()(
22862224
})(pec)
22872225
}
22882226

2227+
/**
 * Registers a fresh MD5 digest under `name` and wraps the given stream so
 * that bytes read through the returned stream update that digest.
 *
 * The digest is stored in `md5HashMap` before the stream is unwrapped, so
 * the map entry exists even if unwrapping fails.
 *
 * @param name       key under which the digest is stored in `md5HashMap`
 * @param is         stream to wrap; must be defined
 * @param md5HashMap mutable map that receives the new `name -> digest` entry
 * @return the digest-tracking stream wrapped in `Some`
 * @throws NoSuchElementException if `is` is `None`
 */
private def addMD5Entry(name: String, is: Option[InputStream],
    md5HashMap: scala.collection.mutable.HashMap[String, MessageDigest]) = {
  val digest = MessageDigest.getInstance("MD5")
  // Record the digest first; any existing entry for `name` is overwritten.
  md5HashMap(name) = digest
  val source = is.get
  Some(new DigestInputStream(source, digest))
}
22892233

22902234
private def addFileToZip(filename: String, file: models.File, zip: ZipOutputStream): Option[InputStream] = {
22912235
files.getBytes(file.id) match {
@@ -2420,6 +2364,12 @@ class Datasets @Inject()(
24202364
Some(new ByteArrayInputStream(s.getBytes("UTF-8")))
24212365
}
24222366

2367+
/**
 * Opens the `metadata/datacite.xml` entry in the zip and returns a stream
 * holding that entry's content.
 *
 * This method only starts the zip entry; it does not write the content
 * into `zip` itself and does not close the entry. The content is currently
 * a fixed placeholder string, encoded as UTF-8.
 *
 * @param zip zip stream in which the new entry is started
 * @return a stream over the placeholder content, wrapped in `Some`
 */
private def addBagitMetadataToZip(zip: ZipOutputStream): Option[InputStream] = {
  zip.putNextEntry(new ZipEntry("metadata/datacite.xml"))
  // Never reassigned, so an immutable val is enough.
  val s = "Datacite v4 stuff goes here."
  Some(new ByteArrayInputStream(s.getBytes("UTF-8")))
}
2372+
24232373
def download(id: UUID, compression: Int, tracking: Boolean) = PermissionAction(Permission.DownloadFiles, Some(ResourceRef(ResourceRef.dataset, id))) { implicit request =>
24242374
implicit val user = request.user
24252375
datasets.get(id) match {

0 commit comments

Comments
 (0)