Skip to content

Commit 659ac87

Browse files
authored
Merge pull request #78 from common-workflow-language/file-size-limits
File size limits
2 parents f3854ff + bc9843f commit 659ac87

File tree

5 files changed

+103
-71
lines changed

5 files changed

+103
-71
lines changed

src/main/java/org/commonwl/viewer/domain/CWLCollection.java

Lines changed: 45 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,11 @@
2424
import com.fasterxml.jackson.databind.node.ArrayNode;
2525
import com.fasterxml.jackson.databind.node.ObjectNode;
2626
import com.fasterxml.jackson.databind.node.TextNode;
27+
import org.apache.commons.io.FileUtils;
2728
import org.apache.commons.io.FilenameUtils;
2829
import org.commonwl.viewer.services.DockerService;
29-
import org.eclipse.egit.github.core.RepositoryContents;
3030
import org.commonwl.viewer.services.GitHubService;
31+
import org.eclipse.egit.github.core.RepositoryContents;
3132
import org.yaml.snakeyaml.Yaml;
3233

3334
import java.io.IOException;
@@ -43,6 +44,10 @@ public class CWLCollection {
4344
private GithubDetails githubInfo;
4445
private String commitSha;
4546

47+
private int totalFileSize;
48+
private int totalFileSizeLimit;
49+
private int singleFileSizeLimit;
50+
4651
// Maps of ID to associated JSON
4752
private Map<String, JsonNode> workflows = new HashMap<>();
4853

@@ -90,10 +95,13 @@ public class CWLCollection {
9095
* @throws IOException Any API errors which may have occurred
9196
*/
9297
public CWLCollection(GitHubService githubService, GithubDetails githubInfo,
93-
String commitSha) throws IOException {
98+
String commitSha, int singleFileSizeLimit, int totalFileSizeLimit) throws IOException {
9499
this.githubInfo = githubInfo;
95100
this.githubService = githubService;
96101
this.commitSha = commitSha;
102+
this.singleFileSizeLimit = singleFileSizeLimit;
103+
this.totalFileSizeLimit = totalFileSizeLimit;
104+
this.totalFileSize = 0;
97105

98106
// Add any CWL files from the Github repo to this collection
99107
List<RepositoryContents> repoContents = githubService.getContents(githubInfo);
@@ -119,32 +127,46 @@ private void addDocs(List<RepositoryContents> repoContents) throws IOException {
119127
// Add the files in the subdirectory to this new folder
120128
addDocs(subdirectory);
121129

122-
// Otherwise this is a file so add to the bundle
123130
} else if (repoContent.getType().equals(FILE)) {
124131

125-
// Get the file extension
126-
int eIndex = repoContent.getName().lastIndexOf('.') + 1;
127-
if (eIndex > 0) {
128-
String extension = repoContent.getName().substring(eIndex);
129-
130-
// If this is a cwl file which needs to be parsed
131-
if (extension.equals(CWL_EXTENSION)) {
132-
133-
// Get the content of this file from Github
134-
GithubDetails githubFile = new GithubDetails(githubInfo.getOwner(),
135-
githubInfo.getRepoName(), githubInfo.getBranch(), repoContent.getPath());
136-
String fileContent = githubService.downloadFile(githubFile, commitSha);
137-
138-
// Parse yaml to JsonNode
139-
Yaml reader = new Yaml();
140-
ObjectMapper mapper = new ObjectMapper();
141-
JsonNode cwlFile = mapper.valueToTree(reader.load(fileContent));
132+
// Keep track of total file size for limit - only track files which
133+
// will be added to the RO bundle due to being small enough
134+
if (repoContent.getSize() <= singleFileSizeLimit) {
135+
totalFileSize += repoContent.getSize();
136+
}
142137

143-
// Add document to those being considered
144-
addDoc(cwlFile, repoContent.getName());
138+
if (totalFileSize <= totalFileSizeLimit) {
139+
// Get the file extension
140+
int eIndex = repoContent.getName().lastIndexOf('.') + 1;
141+
if (eIndex > 0) {
142+
String extension = repoContent.getName().substring(eIndex);
143+
144+
// If this is a cwl file which needs to be parsed
145+
if (extension.equals(CWL_EXTENSION)) {
146+
if (repoContent.getSize() <= singleFileSizeLimit) {
147+
// Get the content of this file from Github
148+
GithubDetails githubFile = new GithubDetails(githubInfo.getOwner(),
149+
githubInfo.getRepoName(), githubInfo.getBranch(), repoContent.getPath());
150+
String fileContent = githubService.downloadFile(githubFile, commitSha);
151+
152+
// Parse yaml to JsonNode
153+
Yaml reader = new Yaml();
154+
ObjectMapper mapper = new ObjectMapper();
155+
JsonNode cwlFile = mapper.valueToTree(reader.load(fileContent));
156+
157+
// Add document to those being considered
158+
addDoc(cwlFile, repoContent.getName());
159+
} else {
160+
throw new IOException("File '" + repoContent.getName() + "' is over singleFileSizeLimit - " +
161+
FileUtils.byteCountToDisplaySize(repoContent.getSize()) + "/" +
162+
FileUtils.byteCountToDisplaySize(singleFileSizeLimit));
163+
}
164+
}
145165
}
166+
} else {
167+
throw new IOException("Contents of the repository are over totalFileSizeLimit of " +
168+
FileUtils.byteCountToDisplaySize(totalFileSizeLimit));
146169
}
147-
148170
}
149171
}
150172
}

src/main/java/org/commonwl/viewer/domain/ROBundle.java

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.commonwl.viewer.domain;
2121

22+
import org.apache.commons.io.FileUtils;
2223
import org.apache.commons.io.FilenameUtils;
2324
import org.apache.taverna.robundle.Bundle;
2425
import org.apache.taverna.robundle.Bundles;
@@ -30,7 +31,6 @@
3031
import org.slf4j.Logger;
3132
import org.slf4j.LoggerFactory;
3233

33-
import java.io.File;
3434
import java.io.IOException;
3535
import java.net.URI;
3636
import java.net.URISyntaxException;
@@ -56,6 +56,7 @@ public class ROBundle {
5656
private GithubDetails githubInfo;
5757
private String commitSha;
5858
private Agent thisApp;
59+
private int singleFileSizeLimit;
5960
private Set<HashableAgent> authors = new HashSet<HashableAgent>();
6061

6162
// Pattern for extracting version from a cwl file
@@ -68,8 +69,9 @@ public class ROBundle {
6869
* @throws IOException Any API errors which may have occurred
6970
*/
7071
public ROBundle(GitHubService githubService, GithubDetails githubInfo, String commitSha,
71-
String appName, String appURL) throws IOException {
72-
// TODO: Add back file size checking on individual files as well as whole bundle
72+
String appName, String appURL, int singleFileSizeLimit) throws IOException {
73+
// File size limits
74+
this.singleFileSizeLimit = singleFileSizeLimit;
7375

7476
// Create a new RO bundle
7577
this.bundle = Bundles.createBundle();
@@ -137,29 +139,48 @@ private void addFiles(List<RepositoryContents> repoContents, Path path) throws I
137139
// Otherwise this is a file so add to the bundle
138140
} else if (repoContent.getType().equals("file")) {
139141

140-
// Get the content of this file from Github
141-
GithubDetails githubFile = new GithubDetails(githubInfo.getOwner(),
142-
githubInfo.getRepoName(), githubInfo.getBranch(), repoContent.getPath());
143-
String fileContent = githubService.downloadFile(githubFile, commitSha);
144-
145-
// Save file to research object bundle
146-
Path newFilePort = path.resolve(repoContent.getName());
147-
Bundles.setStringValue(newFilePort, fileContent);
142+
try {
143+
// Where to store the new file in bundle
144+
Path bundleFilePath = path.resolve(repoContent.getName());
145+
146+
// Raw GitHub URI of this file at the given commit
147+
GithubDetails githubFile = new GithubDetails(githubInfo.getOwner(),
148+
githubInfo.getRepoName(), githubInfo.getBranch(), repoContent.getPath());
149+
URI rawURI = new URI("https://raw.githubusercontent.com/" + githubFile.getOwner() + "/" +
150+
githubFile.getRepoName() + "/" + commitSha + "/" + githubFile.getPath());
151+
152+
// Variable to store file contents
153+
String fileContent = null;
154+
155+
// Download or externally link if oversized
156+
if (repoContent.getSize() <= singleFileSizeLimit) {
157+
// Get the content of this file from Github
158+
fileContent = githubService.downloadFile(githubFile, commitSha);
159+
160+
// Save file to research object bundle
161+
Bundles.setStringValue(bundleFilePath, fileContent);
162+
} else {
163+
logger.info("File " + repoContent.getName() + " is too large to download -" +
164+
FileUtils.byteCountToDisplaySize(repoContent.getSize()) + "/" +
165+
FileUtils.byteCountToDisplaySize(singleFileSizeLimit) +
166+
" + linking externally to RO bundle");
167+
bundleFilePath = Bundles.setReference(bundleFilePath, rawURI);
168+
}
148169

149-
// Manifest aggregation
150-
PathMetadata aggregation = bundle.getManifest().getAggregation(newFilePort);
170+
// Manifest aggregation
171+
PathMetadata aggregation = bundle.getManifest().getAggregation(bundleFilePath);
151172

152-
try {
153173
// Special handling for cwl files
154174
if (FilenameUtils.getExtension(repoContent.getName()).equals("cwl")) {
155175
// Correct mime type (no official standard for yaml)
156176
aggregation.setMediatype("text/x-yaml");
157177

158178
// Add conformsTo for version extracted from regex
159-
// Lower overhead vs parsing entire file
160-
Matcher m = cwlVersionPattern.matcher(fileContent);
161-
if (m.find()) {
162-
aggregation.setConformsTo(new URI("https://w3id.org/cwl/" + m.group(1)));
179+
if (fileContent != null) {
180+
Matcher m = cwlVersionPattern.matcher(fileContent);
181+
if (m.find()) {
182+
aggregation.setConformsTo(new URI("https://w3id.org/cwl/" + m.group(1)));
183+
}
163184
}
164185
}
165186

@@ -169,15 +190,13 @@ private void addFiles(List<RepositoryContents> repoContents, Path path) throws I
169190
aggregation.setAuthoredBy(new ArrayList<Agent>(fileAuthors));
170191

171192
// Set retrieved information for this file in the manifest
172-
aggregation.setRetrievedFrom(new URI("https://raw.githubusercontent.com/" + githubFile.getOwner() + "/" +
173-
githubFile.getRepoName() + "/" + commitSha + "/" + githubFile.getPath()));
193+
aggregation.setRetrievedFrom(rawURI);
174194
aggregation.setRetrievedBy(thisApp);
175195
aggregation.setRetrievedOn(aggregation.getCreatedOn());
176196

177197
} catch (URISyntaxException ex) {
178198
logger.error("Error creating URI for RO Bundle", ex);
179199
}
180-
181200
}
182201
}
183202
}

src/main/java/org/commonwl/viewer/services/ROBundleFactory.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,18 +47,21 @@ public class ROBundleFactory {
4747

4848
private final String applicationName;
4949
private final String applicationURL;
50+
private final int singleFileSizeLimit;
5051
private final Path storageLocation;
5152
private final WorkflowRepository workflowRepository;
5253

5354
@Autowired
5455
public ROBundleFactory(@Value("${applicationName}") String applicationName,
5556
@Value("${applicationURL}") String applicationURL,
5657
@Value("${graphvizStorage}") Path graphvizStorage,
58+
@Value("${singleFileSizeLimit}") int singleFileSizeLimit,
5759
WorkflowRepository workflowRepository) {
5860
this.applicationName = applicationName;
5961
this.applicationURL = applicationURL;
6062
this.storageLocation = graphvizStorage;
6163
this.workflowRepository = workflowRepository;
64+
this.singleFileSizeLimit = singleFileSizeLimit;
6265
}
6366

6467
/**
@@ -75,7 +78,7 @@ void workflowROFromGithub(GitHubService githubService, GithubDetails githubInfo,
7578

7679
// Create a new Research Object Bundle with Github contents
7780
ROBundle bundle = new ROBundle(githubService, githubInfo, commitSha,
78-
applicationName, applicationURL);
81+
applicationName, applicationURL, singleFileSizeLimit);
7982

8083
// Save the bundle to the storage location in properties
8184
Path bundleLocation = bundle.saveToFile(storageLocation);

src/main/java/org/commonwl/viewer/services/WorkflowService.java

Lines changed: 9 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import org.springframework.stereotype.Service;
3030

3131
import java.io.File;
32-
import java.io.IOException;
3332
import java.util.Calendar;
3433
import java.util.Date;
3534

@@ -43,18 +42,24 @@ public class WorkflowService {
4342
private final ROBundleFactory ROBundleFactory;
4443
private final int cacheDays;
4544
private final String graphvizStorage;
45+
private final int singleFileSizeLimit;
46+
private final int totalFileSizeLimit;
4647

4748
@Autowired
4849
public WorkflowService(GitHubService githubService,
4950
WorkflowRepository workflowRepository,
5051
ROBundleFactory ROBundleFactory,
5152
@Value("${cacheDays}") int cacheDays,
52-
@Value("${graphvizStorage}") String graphvizStorage) {
53+
@Value("${graphvizStorage}") String graphvizStorage,
54+
@Value("${singleFileSizeLimit}") int singleFileSizeLimit,
55+
@Value("${totalFileSizeLimit}") int totalFileSizeLimit) {
5356
this.githubService = githubService;
5457
this.workflowRepository = workflowRepository;
5558
this.ROBundleFactory = ROBundleFactory;
5659
this.cacheDays = cacheDays;
5760
this.graphvizStorage = graphvizStorage;
61+
this.singleFileSizeLimit = singleFileSizeLimit;
62+
this.totalFileSizeLimit = totalFileSizeLimit;
5863
}
5964

6065
/**
@@ -69,7 +74,8 @@ public Workflow newWorkflowFromGithub(GithubDetails githubInfo) {
6974
String latestCommit = githubService.getCommitSha(githubInfo);
7075

7176
// Set up CWL utility to collect the documents
72-
CWLCollection cwlFiles = new CWLCollection(githubService, githubInfo, latestCommit);
77+
CWLCollection cwlFiles = new CWLCollection(githubService, githubInfo, latestCommit,
78+
singleFileSizeLimit, totalFileSizeLimit);
7379

7480
// Get the workflow model
7581
Workflow workflowModel = cwlFiles.getWorkflow();
@@ -170,24 +176,4 @@ public boolean cacheExpired(Workflow workflow) {
170176
return false;
171177
}
172178
}
173-
174-
/*
175-
// Check total file size
176-
int totalFileSize = 0;
177-
for (RepositoryContents repoContent : repoContents) {
178-
totalFileSize += repoContent.getSize();
179-
}
180-
if (totalFileSize > totalFileSizeLimit) {
181-
throw new IOException("Files within the Github directory can not be above "
182-
+ totalFileSizeLimit + "B in size");
183-
}
184-
*/
185-
186-
187-
/*
188-
// Check file size before downloading
189-
if (file.getSize() > singleFileSizeLimit) {
190-
throw new IOException("Files within the Github directory can not be above " + singleFileSizeLimit + "B in size");
191-
}
192-
*/
193179
}

src/main/resources/application.properties

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,12 @@ graphvizStorage = /tmp
1616
cacheDays = 1
1717

1818
# File size limit for individual files in bytes
19-
singleFileSizeLimit = 5000000
19+
# CWL files must be smaller than this; other files in the repo may be larger, in which case they will
20+
# be externally linked in the Research Object Bundle
21+
singleFileSizeLimit = 5242880
2022

21-
# File size limit for the contents of the entire Research Object Bundle in bytes
22-
totalFileSizeLimit = 10000000
23+
# File size limit in bytes for the contents of the Research Object Bundle (not counting external links)
24+
totalFileSizeLimit = 1073741824
2325

2426
#=======================
2527
# Github API settings

0 commit comments

Comments
 (0)