Skip to content

Commit 3d0ba7e

Browse files
authored
Audio: add support for timestamp_granularities (#374)
1 parent a4877bd commit 3d0ba7e

File tree

8 files changed

+133
-5
lines changed

8 files changed

+133
-5
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,7 @@ $response = $client->audio()->transcribe([
391391
'model' => 'whisper-1',
392392
'file' => fopen('audio.mp3', 'r'),
393393
'response_format' => 'verbose_json',
394+
'timestamp_granularities' => ['segment', 'word']
394395
]);
395396

396397
$response->task; // 'transcribe'
@@ -412,6 +413,12 @@ foreach ($response->segments as $segment) {
412413
$segment->transient; // false
413414
}
414415

416+
foreach ($response->words as $word) {
417+
$word->word; // 'Hello'
418+
$word->start; // 0.32
419+
$word->end; // 0.92
420+
}
421+
415422
$response->toArray(); // ['task' => 'transcribe', ...]
416423
```
417424

src/Resources/Audio.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ public function transcribe(array $parameters): TranscriptionResponse
5656
{
5757
$payload = Payload::upload('audio/transcriptions', $parameters);
5858

59-
/** @var Response<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, text: string}> $response */
59+
/** @var Response<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, words: array<int, array{word: string, start: float, end: float}>, text: string}> $response */
6060
$response = $this->transporter->requestObject($payload);
6161

6262
return TranscriptionResponse::from($response->data(), $response->meta());

src/Responses/Audio/TranscriptionResponse.php

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@
1212
use OpenAI\Testing\Responses\Concerns\Fakeable;
1313

1414
/**
15-
* @implements ResponseContract<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, text: string}>
15+
* @implements ResponseContract<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, words: array<int, array{word: string, start: float, end: float}>, text: string}>
1616
*/
1717
final class TranscriptionResponse implements ResponseContract, ResponseHasMetaInformationContract
1818
{
1919
/**
20-
* @use ArrayAccessible<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, text: string}>
20+
* @use ArrayAccessible<array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, words: array<int, array{word: string, start: float, end: float}>, text: string}>
2121
*/
2222
use ArrayAccessible;
2323

@@ -26,12 +26,14 @@ final class TranscriptionResponse implements ResponseContract, ResponseHasMetaIn
2626

2727
/**
2828
* @param array<int, TranscriptionResponseSegment> $segments
29+
* @param array<int, TranscriptionResponseWord> $words
2930
*/
3031
private function __construct(
3132
public readonly ?string $task,
3233
public readonly ?string $language,
3334
public readonly ?float $duration,
3435
public readonly array $segments,
36+
public readonly array $words,
3537
public readonly string $text,
3638
private readonly MetaInformation $meta,
3739
) {
@@ -40,7 +42,7 @@ private function __construct(
4042
/**
4143
* Acts as static factory, and returns a new Response instance.
4244
*
43-
* @param array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, text: string}|string $attributes
45+
* @param array{task: ?string, language: ?string, duration: ?float, segments: array<int, array{id: int, seek: int, start: float, end: float, text: string, tokens: array<int, int>, temperature: float, avg_logprob: float, compression_ratio: float, no_speech_prob: float, transient?: bool}>, words: array<int, array{word: string, start: float, end: float}>, text: string}|string $attributes
4446
*/
4547
public static function from(array|string $attributes, MetaInformation $meta): self
4648
{
@@ -52,11 +54,16 @@ public static function from(array|string $attributes, MetaInformation $meta): se
5254
$result
5355
), $attributes['segments']) : [];
5456

57+
$words = isset($attributes['words']) ? array_map(fn (array $result): TranscriptionResponseWord => TranscriptionResponseWord::from(
58+
$result
59+
), $attributes['words']) : [];
60+
5561
return new self(
5662
$attributes['task'] ?? null,
5763
$attributes['language'] ?? null,
5864
$attributes['duration'] ?? null,
5965
$segments,
66+
$words,
6067
$attributes['text'],
6168
$meta,
6269
);
@@ -75,6 +82,10 @@ public function toArray(): array
7582
static fn (TranscriptionResponseSegment $result): array => $result->toArray(),
7683
$this->segments,
7784
),
85+
'words' => array_map(
86+
static fn (TranscriptionResponseWord $result): array => $result->toArray(),
87+
$this->words,
88+
),
7889
'text' => $this->text,
7990
];
8091
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace OpenAI\Responses\Audio;
6+
7+
use OpenAI\Contracts\ResponseContract;
8+
use OpenAI\Responses\Concerns\ArrayAccessible;
9+
10+
/**
11+
* @implements ResponseContract<array{word: string, start: float, end: float}>
12+
*/
13+
final class TranscriptionResponseWord implements ResponseContract
14+
{
15+
/**
16+
* @use ArrayAccessible<array{word: string, start: float, end: float}>
17+
*/
18+
use ArrayAccessible;
19+
20+
private function __construct(
21+
public readonly string $word,
22+
public readonly float $start,
23+
public readonly float $end,
24+
) {
25+
}
26+
27+
/**
28+
* Acts as static factory, and returns a new Response instance.
29+
*
30+
* @param array{word: string, start: float, end: float} $attributes
31+
*/
32+
public static function from(array $attributes): self
33+
{
34+
return new self(
35+
$attributes['word'],
36+
$attributes['start'],
37+
$attributes['end'],
38+
);
39+
}
40+
41+
/**
42+
* {@inheritDoc}
43+
*/
44+
public function toArray(): array
45+
{
46+
return [
47+
'word' => $this->word,
48+
'start' => $this->start,
49+
'end' => $this->end,
50+
];
51+
}
52+
}

src/ValueObjects/Transporter/Payload.php

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,14 +164,22 @@ public function toRequest(BaseUri $baseUri, Headers $headers, QueryParams $query
164164
if ($this->contentType === ContentType::MULTIPART) {
165165
$streamBuilder = new MultipartStreamBuilder($psr17Factory);
166166

167-
/** @var array<string, StreamInterface|string|int|float|bool> $parameters */
167+
/** @var array<string, StreamInterface|string|int|float|bool|array<int, string>> $parameters */
168168
$parameters = $this->parameters;
169169

170170
foreach ($parameters as $key => $value) {
171171
if (is_int($value) || is_float($value) || is_bool($value)) {
172172
$value = (string) $value;
173173
}
174174

175+
if (is_array($value)) {
176+
foreach ($value as $nestedValue) {
177+
$streamBuilder->addResource($key.'[]', $nestedValue);
178+
}
179+
180+
continue;
181+
}
182+
175183
$streamBuilder->addResource($key, $value);
176184
}
177185

tests/Fixtures/Audio.php

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,28 @@ function audioTranscriptionVerboseJson(): array
3333
'transient' => false,
3434
],
3535
],
36+
'words' => [
37+
[
38+
'word' => 'Hello',
39+
'start' => 0.31999999284744,
40+
'end' => 0.9200000166893,
41+
],
42+
[
43+
'word' => 'how',
44+
'start' => 1.0,
45+
'end' => 1.5599999427795,
46+
],
47+
[
48+
'word' => 'are',
49+
'start' => 1.5599999427795,
50+
'end' => 1.8799999952316,
51+
],
52+
[
53+
'word' => 'you',
54+
'start' => 1.8799999952316,
55+
'end' => 2.1600000858307,
56+
],
57+
],
3658
'text' => 'Hello, how are you?',
3759
];
3860
}

tests/Responses/Audio/TranscriptionResponse.php

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
use OpenAI\Responses\Audio\TranscriptionResponse;
44
use OpenAI\Responses\Audio\TranscriptionResponseSegment;
5+
use OpenAI\Responses\Audio\TranscriptionResponseWord;
56
use OpenAI\Responses\Meta\MetaInformation;
67

78
test('from json', function () {
@@ -13,6 +14,7 @@
1314
->language->toBeNull()
1415
->duration->toBeNull()
1516
->segments->toBeEmpty()
17+
->words->toBeEmpty()
1618
->text->toBe('Hello, how are you?')
1719
->meta()->toBeInstanceOf(MetaInformation::class);
1820
});
@@ -28,6 +30,9 @@
2830
->segments->toBeArray()
2931
->segments->toHaveCount(1)
3032
->segments->each->toBeInstanceOf(TranscriptionResponseSegment::class)
33+
->words->toBeArray()
34+
->words->toHaveCount(4)
35+
->words->each->toBeInstanceOf(TranscriptionResponseWord::class)
3136
->text->toBe('Hello, how are you?')
3237
->meta()->toBeInstanceOf(MetaInformation::class);
3338
});
@@ -41,6 +46,7 @@
4146
->language->toBeNull()
4247
->duration->toBeNull()
4348
->segments->toBeEmpty()
49+
->words->toBeEmpty()
4450
->text->toBe('Hello, how are you?')
4551
->meta()->toBeInstanceOf(MetaInformation::class);
4652
});
@@ -54,6 +60,7 @@
5460
->language->toBeNull()
5561
->duration->toBeNull()
5662
->segments->toBeEmpty()
63+
->words->toBeEmpty()
5764
->text->toBe(<<<'SRT'
5865
1
5966
00:00:00,000 --> 00:00:04,000
@@ -73,6 +80,7 @@
7380
->language->toBeNull()
7481
->duration->toBeNull()
7582
->segments->toBeEmpty()
83+
->words->toBeEmpty()
7684
->text->toBe(<<<'VTT'
7785
WEBVTT
7886
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?php
2+
3+
use OpenAI\Responses\Audio\TranscriptionResponseWord;
4+
5+
// Builds a word from the first `words` entry of the verbose JSON fixture and
// checks the resulting type and each of its three properties.
test('from', function () {
6+
$result = TranscriptionResponseWord::from(audioTranscriptionVerboseJson()['words'][0]);
7+
8+
expect($result)
9+
->toBeInstanceOf(TranscriptionResponseWord::class)
10+
->word->toBe('Hello')
11+
->start->toBe(0.31999999284744)
12+
->end->toBe(0.9200000166893);
13+
});
14+
15+
// Round trip: converting the word back with toArray() must reproduce the
// fixture entry it was created from.
test('to array', function () {
16+
$result = TranscriptionResponseWord::from(audioTranscriptionVerboseJson()['words'][0]);
17+
18+
expect($result->toArray())
19+
->toBe(audioTranscriptionVerboseJson()['words'][0]);
20+
});

0 commit comments

Comments
 (0)