Skip to content

Commit 23860a5

Browse files
codebase/using-amazon-textract-in-spring-boot-to-extract-text-from-images [BAEL-6759] (#17362)
* adding module skeleton
* listing module names alphabetically
* adding textract configurations
* fix: adding region configuration property
* adding service layer
* incorporate review comments
* adding live test
* adding textract file type validation
1 parent 5520186 commit 23860a5

File tree

11 files changed

+316
-2
lines changed

11 files changed

+316
-2
lines changed
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <artifactId>amazon-textract</artifactId>
    <version>0.0.1</version>
    <packaging>jar</packaging>
    <name>amazon-textract</name>
    <description>codebase demonstrating the integration of Amazon Textract in Spring Boot to extract texts from images</description>

    <parent>
        <groupId>com.baeldung</groupId>
        <artifactId>parent-boot-3</artifactId>
        <version>0.0.1-SNAPSHOT</version>
        <relativePath>../../parent-boot-3</relativePath>
    </parent>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-validation</artifactId>
        </dependency>
        <!-- Annotation processor that generates configuration metadata at build time;
             marked optional so it is not passed on transitively to consumers of this artifact. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>
        <!-- AWS SDK v2 client for Amazon Textract; version is pinned in <properties> below. -->
        <dependency>
            <groupId>software.amazon.awssdk</groupId>
            <artifactId>textract</artifactId>
            <version>${amazon-textract.version}</version>
        </dependency>
    </dependencies>

    <properties>
        <java.version>17</java.version>
        <amazon-textract.version>2.27.5</amazon-textract.version>
    </properties>

</project>
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
package com.baeldung.textract;
2+
3+
import org.springframework.boot.SpringApplication;
4+
import org.springframework.boot.autoconfigure.SpringBootApplication;
5+
6+
@SpringBootApplication
7+
public class Application {
8+
9+
public static void main(String[] args) {
10+
SpringApplication.run(Application.class, args);
11+
}
12+
13+
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
package com.baeldung.textract.configuration;
2+
3+
import org.springframework.boot.context.properties.EnableConfigurationProperties;
4+
import org.springframework.context.annotation.Bean;
5+
import org.springframework.context.annotation.Configuration;
6+
7+
import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
8+
import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
9+
import software.amazon.awssdk.regions.Region;
10+
import software.amazon.awssdk.services.textract.TextractClient;
11+
12+
@Configuration
13+
@EnableConfigurationProperties(AwsConfigurationProperties.class)
14+
public class AmazonTextractConfiguration {
15+
16+
private final AwsConfigurationProperties awsConfigurationProperties;
17+
18+
public AmazonTextractConfiguration(AwsConfigurationProperties awsConfigurationProperties) {
19+
this.awsConfigurationProperties = awsConfigurationProperties;
20+
}
21+
22+
@Bean
23+
public TextractClient textractClient() {
24+
String region = awsConfigurationProperties.getRegion();
25+
String accessKey = awsConfigurationProperties.getAccessKey();
26+
String secretKey = awsConfigurationProperties.getSecretKey();
27+
AwsBasicCredentials awsCredentials = AwsBasicCredentials.create(accessKey, secretKey);
28+
29+
return TextractClient.builder()
30+
.region(Region.of(region))
31+
.credentialsProvider(StaticCredentialsProvider.create(awsCredentials))
32+
.build();
33+
}
34+
35+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package com.baeldung.textract.configuration;
2+
3+
import org.springframework.boot.context.properties.ConfigurationProperties;
4+
import org.springframework.validation.annotation.Validated;
5+
6+
import jakarta.validation.constraints.NotBlank;
7+
8+
@Validated
9+
@ConfigurationProperties(prefix = "com.baeldung.aws")
10+
public class AwsConfigurationProperties {
11+
12+
@NotBlank(message = "AWS region must be configured")
13+
private String region;
14+
15+
@NotBlank(message = "AWS access key must be configured")
16+
private String accessKey;
17+
18+
@NotBlank(message = "AWS secret key must be configured")
19+
private String secretKey;
20+
21+
public String getRegion() {
22+
return region;
23+
}
24+
25+
public void setRegion(String region) {
26+
this.region = region;
27+
}
28+
29+
public String getAccessKey() {
30+
return accessKey;
31+
}
32+
33+
public void setAccessKey(String accessKey) {
34+
this.accessKey = accessKey;
35+
}
36+
37+
public String getSecretKey() {
38+
return secretKey;
39+
}
40+
41+
public void setSecretKey(String secretKey) {
42+
this.secretKey = secretKey;
43+
}
44+
45+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
package com.baeldung.textract.service;
2+
3+
import java.io.IOException;
4+
import java.util.stream.Collectors;
5+
6+
import org.springframework.stereotype.Service;
7+
import org.springframework.validation.annotation.Validated;
8+
import org.springframework.web.multipart.MultipartFile;
9+
10+
import com.baeldung.textract.validation.ValidFileType;
11+
12+
import software.amazon.awssdk.core.SdkBytes;
13+
import software.amazon.awssdk.services.textract.TextractClient;
14+
import software.amazon.awssdk.services.textract.model.Block;
15+
import software.amazon.awssdk.services.textract.model.BlockType;
16+
import software.amazon.awssdk.services.textract.model.DetectDocumentTextResponse;
17+
18+
@Service
19+
@Validated
20+
public class TextExtractor {
21+
22+
private final TextractClient textractClient;
23+
24+
public TextExtractor(TextractClient textractClient) {
25+
this.textractClient = textractClient;
26+
}
27+
28+
public String extract(@ValidFileType MultipartFile image) throws IOException {
29+
byte[] imageBytes = image.getBytes();
30+
DetectDocumentTextResponse response = textractClient.detectDocumentText(request -> request
31+
.document(document -> document
32+
.bytes(SdkBytes.fromByteArray(imageBytes))
33+
.build())
34+
.build());
35+
36+
return transformTextDetectionResponse(response);
37+
}
38+
39+
public String extract(String bucketName, String objectKey) {
40+
DetectDocumentTextResponse response = textractClient.detectDocumentText(request -> request
41+
.document(document -> document
42+
.s3Object(s3Object -> s3Object
43+
.bucket(bucketName)
44+
.name(objectKey)
45+
.build())
46+
.build())
47+
.build());
48+
49+
return transformTextDetectionResponse(response);
50+
}
51+
52+
private String transformTextDetectionResponse(DetectDocumentTextResponse response) {
53+
return response.blocks()
54+
.stream()
55+
.filter(block -> block.blockType().equals(BlockType.LINE))
56+
.map(Block::text)
57+
.collect(Collectors.joining(" "));
58+
}
59+
60+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package com.baeldung.textract.validation;
2+
3+
import java.util.List;
4+
5+
import org.springframework.web.multipart.MultipartFile;
6+
7+
import jakarta.validation.ConstraintValidator;
8+
import jakarta.validation.ConstraintValidatorContext;
9+
10+
public class TextractFileValidator implements ConstraintValidator<ValidFileType, MultipartFile> {
11+
12+
private static final List<String> VALID_CONTENT_TYPES = List.of(
13+
"image/png",
14+
"image/jpeg",
15+
"image/tiff",
16+
"application/pdf");
17+
18+
@Override
19+
public boolean isValid(MultipartFile file, ConstraintValidatorContext context) {
20+
if (file == null || file.isEmpty()) {
21+
return false;
22+
}
23+
return VALID_CONTENT_TYPES.contains(file.getContentType());
24+
}
25+
26+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package com.baeldung.textract.validation;
2+
3+
import java.lang.annotation.Documented;
4+
import java.lang.annotation.ElementType;
5+
import java.lang.annotation.Retention;
6+
import java.lang.annotation.RetentionPolicy;
7+
import java.lang.annotation.Target;
8+
9+
import jakarta.validation.Constraint;
10+
import jakarta.validation.Payload;
11+
12+
@Documented
13+
@Target(ElementType.PARAMETER)
14+
@Retention(RetentionPolicy.RUNTIME)
15+
@Constraint(validatedBy = TextractFileValidator.class)
16+
public @interface ValidFileType {
17+
18+
String message() default "Invalid file type. Allowed types are PNG, JPEG, TIFF, and PDF.";
19+
20+
Class<?>[] groups() default {};
21+
22+
Class<? extends Payload>[] payload() default {};
23+
24+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# AWS settings read under the "com.baeldung.aws" prefix.
# Each value is a ${...} placeholder and must be supplied externally
# (presumably as environment variables of the same name — confirm for your setup).
com:
  baeldung:
    aws:
      region: ${AWS_REGION}
      access-key: ${AWS_ACCESS_KEY}
      secret-key: ${AWS_SECRET_KEY}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
package com.baeldung.textract.service;
2+
3+
import static org.assertj.core.api.Assertions.assertThat;
4+
import static org.assertj.core.api.Assertions.assertThatThrownBy;
5+
6+
import java.io.ByteArrayInputStream;
7+
import java.io.IOException;
8+
import java.nio.file.Files;
9+
import java.nio.file.Path;
10+
11+
import org.junit.jupiter.api.Test;
12+
import org.springframework.beans.factory.annotation.Autowired;
13+
import org.springframework.boot.test.context.SpringBootTest;
14+
import org.springframework.core.io.ClassPathResource;
15+
import org.springframework.mock.web.MockMultipartFile;
16+
import org.springframework.web.multipart.MultipartFile;
17+
18+
import jakarta.validation.ConstraintViolationException;
19+
import net.bytebuddy.utility.RandomString;
20+
21+
@SpringBootTest
22+
class TextExtractorLiveTest {
23+
24+
@Autowired
25+
private TextExtractor textExtractor;
26+
27+
@Test
28+
void whenTextractCalledWithImage_thenCorrectTextExtracted() throws IOException {
29+
String fileName = "sample-image.png";
30+
Path filePath = new ClassPathResource(fileName).getFile().toPath();
31+
ByteArrayInputStream fileContent = new ByteArrayInputStream(Files.readAllBytes(filePath));
32+
MultipartFile file = new MockMultipartFile(fileName, fileName, "image/png", fileContent);
33+
34+
String response = textExtractor.extract(file);
35+
36+
assertThat(response).isEqualTo("Exploring Amazon Textract");
37+
}
38+
39+
@Test
40+
void whenTextractCalledWithS3Object_thenCorrectTextExtracted() {
41+
String bucketName = "baeldung-amazon-textract-tutorial-bucket";
42+
String objectKey = "sample-image.png";
43+
44+
String response = textExtractor.extract(bucketName, objectKey);
45+
46+
assertThat(response).isEqualTo("Exploring Amazon Textract");
47+
}
48+
49+
@Test
50+
void whenInvalidFileTypeProvided_thenConstraintViolationExceptionThrown() throws IOException {
51+
String fileName = "invalid-file.txt";
52+
ByteArrayInputStream fileContent = new ByteArrayInputStream(RandomString.make().getBytes());
53+
MultipartFile invalidFile = new MockMultipartFile(fileName, fileName, "text/plain", fileContent);
54+
55+
assertThatThrownBy(() -> textExtractor.extract(invalidFile))
56+
.isInstanceOf(ConstraintViolationException.class)
57+
.hasMessageContaining("Invalid file type. Allowed types are PNG, JPEG, TIFF, and PDF.");
58+
}
59+
60+
}
93.6 KB
Loading

0 commit comments

Comments
 (0)