|
1 | 1 | # BioJava-Spark |
2 | | -Algorithms that are built around BioJava and are running on Apache Spark |
3 | 2 |
|
| 3 | +Algorithms built around BioJava that run on Apache Spark
4 | 4 |
|
5 | 5 | [](https://travis-ci.org/biojava/biojava-spark) |
6 | 6 | [](https://github.com/biojava/biojava/blob/master/LICENSE) |
7 | 7 | [](https://github.com/biojava/biojava-spark) |
8 | 8 | [](https://github.com/biojava/biojava-spark/) |
9 | 9 |
|
10 | | -# Starting up |
| 10 | +## Starting up |
11 | 11 |
|
12 | 12 | ### Some initial instructions can be found on the mmtf-spark project |
13 | | -https://github.com/rcsb/mmtf-spark |
| 13 | +https://github.com/sbl-sdsc/mmtf-spark |
| 14 | + |
14 | 15 | ## First download and untar a Hadoop sequence file of the PDB (~7 GB download) |
| 16 | + |
15 | 17 | ```bash |
16 | 18 | wget http://mmtf.rcsb.org/v1.0/hadoopfiles/full.tar |
17 | 19 | tar -xvf full.tar |
18 | 20 | ``` |
19 | 21 | Or you can get a C-alpha, phosphate, ligand-only version (~800 MB download)
| 22 | + |
20 | 23 | ```bash |
21 | 24 | wget http://mmtf.rcsb.org/v1.0/hadoopfiles/reduced.tar |
22 | 25 | tar -xvf reduced.tar |
23 | 26 | ``` |
24 | 27 | ### Second, add the biojava-spark dependency to your pom
25 | 28 |
|
26 | 29 | ```xml |
27 | | - <dependency> |
28 | | - <groupId>org.biojava</groupId> |
29 | | - <artifactId>biojava-spark</artifactId> |
30 | | - <version>0.2.1</version> |
31 | | - </dependency> |
| 30 | +<dependency> |
| 31 | + <groupId>org.biojava</groupId> |
| 32 | + <artifactId>biojava-spark</artifactId> |
| 33 | + <version>0.2.1</version> |
| 34 | +</dependency> |
32 | 35 | ``` |
33 | 36 |
|
34 | | - |
35 | | - |
36 | 37 | ## Extra Biojava examples |
37 | 38 |
|
38 | 39 | ### Do some simple quality filtering |
39 | 40 |
|
40 | 41 | ```java |
41 | | - float maxResolution = 3.0f; |
42 | | - float maxRfree = 0.3f; |
43 | | - StructureDataRDD structureData = new StructureDataRDD("/path/to/file") |
44 | | - .filterResolution(maxResolution) |
45 | | - .filterRfree(maxRfree); |
| 42 | +float maxResolution = 3.0f; |
| 43 | +float maxRfree = 0.3f; |
| 44 | +StructureDataRDD structureData = new StructureDataRDD("/path/to/file") |
| 45 | + .filterResolution(maxResolution) |
| 46 | + .filterRfree(maxRfree); |
46 | 47 | ``` |
47 | 48 |
|
48 | 49 | ### Summarising the elements in the PDB
| 50 | + |
49 | 51 | ```java |
50 | | - Map<String, Long> elementCountMap = BiojavaSparkUtils.findAtoms(structureData).countByElement(); |
| 52 | +Map<String, Long> elementCountMap = BiojavaSparkUtils.findAtoms(structureData).countByElement(); |
51 | 53 | ``` |
52 | 54 |
|
53 | 55 | ### Finding inter-atomic contacts from the PDB |
54 | 56 |
|
55 | 57 | ```java |
56 | | - Double mean = BiojavaSparkUtils.findContacts(structureData, |
57 | | - new AtomSelectObject() |
58 | | - .groupNameList(new String[] {"PRO","LYS"}) |
59 | | - .elementNameList(new String[] {"C"}) |
60 | | - .atomNameList(new String[] {"CA"}), |
61 | | - cutoff) |
62 | | - .getDistanceDistOfAtomInts("CA", "CA") |
63 | | - .mean(); |
64 | | - System.out.println("\nMean PRO-LYS CA-CA distance: "+mean); |
| 58 | +Double mean = BiojavaSparkUtils.findContacts(structureData, |
| 59 | + new AtomSelectObject() |
| 60 | + .groupNameList(new String[] {"PRO","LYS"}) |
| 61 | + .elementNameList(new String[] {"C"}) |
| 62 | + .atomNameList(new String[] {"CA"}), |
| 63 | + cutoff) |
| 64 | + .getDistanceDistOfAtomInts("CA", "CA") |
| 65 | + .mean(); |
| 66 | +System.out.println("\nMean PRO-LYS CA-CA distance: " + mean); |
65 | 67 | ``` |
66 | | - |
|
0 commit comments