Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,30 @@
package io.opentelemetry.javadocs;

public class Artifact {
private final String group;
private final String name;
private final String version;

public Artifact(String name, String version) {
public Artifact(String group, String name, String version) {
this.group = group;
this.name = name;
this.version = version;
}

public String getGroup() {
return group;
}

public String getName() {
return name;
}

public String getVersion() {
return version;
}

@Override
public String toString() {
return group + ":" + name + ":" + version;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;

/**
* The javadoc.io site relies on someone accessing the page for an artifact version in order to
Expand All @@ -26,7 +28,16 @@
* pages on the javadoc.io site to trigger updates.
*/
public final class JavaDocsCrawler {
private static final String GROUP = "io.opentelemetry";
// Track list of groups and the minimum artifact versions that should be crawled. Update to the
// latest periodically to avoid crawling artifacts that stopped being published.
private static final Map<String, String> GROUPS_AND_MIN_VERSION =
Map.of(
"io.opentelemetry", "1.49.0",
"io.opentelemetry.instrumentation", "2.15.0",
"io.opentelemetry.contrib", "1.46.0",
"io.opentelemetry.semconv", "1.32.0",
"io.opentelemetry.proto", "1.3.2");

private static final String MAVEN_CENTRAL_BASE_URL =
"https://search.maven.org/solrsearch/select?q=g:";
private static final String JAVA_DOCS_BASE_URL = "https://javadoc.io/doc/";
Expand All @@ -41,23 +52,34 @@ public final class JavaDocsCrawler {

/**
 * Entry point: for every group in {@code GROUPS_AND_MIN_VERSION}, looks up the group's artifacts
 * and crawls their javadoc pages, logging which artifacts triggered an update.
 *
 * <p>A group with no artifacts, or with no updates, is logged and skipped so the remaining groups
 * are still processed. The groups come from {@code Map.of}, whose iteration order is unspecified,
 * so groups are not processed in declaration order.
 *
 * @param args command line arguments; not read
 * @throws Exception if looking up artifacts or crawling a javadoc page fails
 */
public static void main(String[] args) throws Exception {
  HttpClient client = HttpClient.newHttpClient();

  for (Map.Entry<String, String> groupAndMinVersion : GROUPS_AND_MIN_VERSION.entrySet()) {
    String group = groupAndMinVersion.getKey();

    List<Artifact> artifacts = getArtifacts(client, group);
    if (artifacts.isEmpty()) {
      logger.log(Level.SEVERE, "No artifacts found for group " + group);
      continue;
    }
    // Pass the group as a format argument rather than concatenating it into the pattern, so a
    // '%' in a group name can never be interpreted as a format specifier.
    logger.info(
        String.format(
            Locale.ROOT, "Found %d artifacts for group %s", artifacts.size(), group));

    // The map value is the minimum version handed to crawlJavaDocs for this group.
    List<Artifact> updated = crawlJavaDocs(client, groupAndMinVersion.getValue(), artifacts);
    if (updated.isEmpty()) {
      logger.info("No updates were needed for group " + group);
      continue;
    }

    logger.info(
        "Artifacts that triggered updates for group "
            + group
            + ":\n"
            + updated.stream().map(Artifact::toString).collect(Collectors.joining("\n")));
  }
}

static List<Artifact> getArtifacts(HttpClient client) throws IOException, InterruptedException {
static List<Artifact> getArtifacts(HttpClient client, String group)
throws IOException, InterruptedException {
int start = 0;
Integer numFound;
List<Artifact> result = new ArrayList<>();
Expand All @@ -67,7 +89,7 @@ static List<Artifact> getArtifacts(HttpClient client) throws IOException, Interr
Thread.sleep(THROTTLE_MS); // try not to DDoS the site, it gets knocked over easily
}

Map<?, ?> map = queryMavenCentral(client, start);
Map<?, ?> map = queryMavenCentral(client, group, start);

numFound =
Optional.ofNullable(map)
Expand All @@ -93,26 +115,26 @@ private static List<Artifact> convertToArtifacts(Map<?, ?> map) {
List<Artifact> artifacts = new ArrayList<>();
for (Object doc : docs) {
Map<?, ?> docMap = (Map<?, ?>) doc;
String artifact = (String) docMap.get("a");
String version = (String) docMap.get("latestVersion");
if (artifact != null && version != null) {
artifacts.add(new Artifact(artifact, version));
}
String group = Objects.requireNonNull((String) docMap.get("g"), "g");
String artifact = Objects.requireNonNull((String) docMap.get("a"), "a");
String version =
Objects.requireNonNull((String) docMap.get("latestVersion"), "latestVersion");
artifacts.add(new Artifact(Objects.requireNonNull(group), artifact, version));
}
return artifacts;
})
.orElseGet(ArrayList::new);
}

private static Map<?, ?> queryMavenCentral(HttpClient client, int start)
private static Map<?, ?> queryMavenCentral(HttpClient client, String group, int start)
throws IOException, InterruptedException {
URI uri =
URI.create(
String.format(
Locale.ROOT,
"%s%s&rows=%d&start=%d&wt=json",
MAVEN_CENTRAL_BASE_URL,
GROUP,
group,
PAGE_SIZE,
start));

Expand All @@ -122,21 +144,35 @@ private static List<Artifact> convertToArtifacts(Map<?, ?> map) {
if (response.statusCode() != 200) {
logger.log(
Level.SEVERE,
"Unexpected response code: " + response.statusCode() + ": " + response.body());
"Unexpected response code "
+ response.statusCode()
+ " for uri: "
+ uri.toASCIIString()
+ "\n"
+ response.body());
throw new IOException("Unable to pull Maven central artifacts list");
}
return objectMapper.readValue(response.body(), Map.class);
}

static List<String> crawlJavaDocs(HttpClient client, List<Artifact> artifacts)
static List<Artifact> crawlJavaDocs(
HttpClient client, String minVersion, List<Artifact> artifacts)
throws IOException, InterruptedException {
List<String> updatedArtifacts = new ArrayList<>();
List<Artifact> updatedArtifacts = new ArrayList<>();

for (Artifact artifact : artifacts) {
if (artifact.getVersion().compareTo(minVersion) < 0) {
logger.info(
String.format(
"Skipping crawling %s due to version %s being less than minVersion %s",
artifact, artifact.getVersion(), minVersion));
continue;
}

String[] parts = artifact.getName().split("-");
StringBuilder path = new StringBuilder();
path.append(JAVA_DOCS_BASE_URL)
.append(GROUP)
.append(artifact.getGroup())
.append("/")
.append(artifact.getName())
.append("/")
Expand All @@ -146,6 +182,7 @@ static List<String> crawlJavaDocs(HttpClient client, List<Artifact> artifacts)
.append("/package-summary.html");

HttpRequest crawlRequest = HttpRequest.newBuilder(URI.create(path.toString())).GET().build();
logger.info(String.format("Crawling %s at: %s", artifact, path));
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find that extra logging is almost always helpful for scripts.

In this case, logging helped me discover that we were scanning a bunch of old artifacts and not scanning artifacts from other groups (Maven must use exact matching on groups rather than startsWith matching).

HttpResponse<String> crawlResponse =
client.send(crawlRequest, HttpResponse.BodyHandlers.ofString());

Expand All @@ -156,15 +193,15 @@ static List<String> crawlJavaDocs(HttpClient client, List<Artifact> artifacts)
String.format(
Locale.ROOT,
"Crawl failed for %s with status code %d at URL %s\nResponse: %s",
artifact.getName(),
artifact,
crawlResponse.statusCode(),
path,
crawlResponse.body()));
continue;
}

if (crawlResponse.body().contains(JAVA_DOC_DOWNLOADED_TEXT)) {
updatedArtifacts.add(artifact.getName());
updatedArtifacts.add(artifact);
}

Thread.sleep(THROTTLE_MS); // some light throttling
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.ArrayList;
import java.util.List;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
Expand All @@ -39,8 +38,8 @@ void testGetArtifactsHandlesPagination() throws IOException, InterruptedExceptio
"response": {
"numFound": 40,
"docs": [
{"a": "artifact1", "latestVersion": "1.0"},
{"a": "artifact2", "latestVersion": "1.1"}
{"g": "group", "a": "artifact1", "latestVersion": "1.0"},
{"g": "group", "a": "artifact2", "latestVersion": "1.1"}
]
}
}
Expand All @@ -51,7 +50,7 @@ void testGetArtifactsHandlesPagination() throws IOException, InterruptedExceptio
"response": {
"numFound": 40,
"docs": [
{"a": "artifact3", "latestVersion": "2.0"}
{"g": "group", "a": "artifact3", "latestVersion": "2.0"}
]
}
}
Expand All @@ -66,7 +65,7 @@ void testGetArtifactsHandlesPagination() throws IOException, InterruptedExceptio
.thenReturn(mockMavenCentralRequest1)
.thenReturn(mockMavenCentralRequest2);

List<Artifact> artifacts = JavaDocsCrawler.getArtifacts(mockClient);
List<Artifact> artifacts = JavaDocsCrawler.getArtifacts(mockClient, "io.opentelemetry");

// 2 calls for the pagination
verify(mockClient, times(2)).send(any(), any());
Expand All @@ -75,22 +74,21 @@ void testGetArtifactsHandlesPagination() throws IOException, InterruptedExceptio

/**
 * Verifies that crawling a single artifact whose version equals the minimum version issues
 * exactly one request to the expected javadoc.io URL and reports that artifact as updated when
 * the response body is the "downloaded" marker text.
 */
@Test
void testCrawler() throws IOException, InterruptedException {
  Artifact artifact = new Artifact("io.opentelemetry", "opentelemetry-context", "1.49.0");
  ArgumentCaptor<HttpRequest> requestCaptor = ArgumentCaptor.forClass(HttpRequest.class);

  // Stub a successful javadoc.io response whose body is the "downloaded" marker text.
  when(mockJavaDocResponse.body()).thenReturn(JAVA_DOC_DOWNLOADED_TEXT);
  when(mockJavaDocResponse.statusCode()).thenReturn(200);

  when(mockClient.send(any(), any())).thenReturn(mockJavaDocResponse);

  List<Artifact> updated = JavaDocsCrawler.crawlJavaDocs(mockClient, "1.49.0", List.of(artifact));

  verify(mockClient, times(1)).send(requestCaptor.capture(), any());

  assertThat(requestCaptor.getValue().uri().toString())
      .isEqualTo(
          "https://javadoc.io/doc/io.opentelemetry/opentelemetry-context/1.49.0/opentelemetry/context/package-summary.html");
  assertThat(updated).containsExactly(artifact);
}
}
Loading