@@ -15,6 +15,18 @@ static bool g_cpu = registerDeviceInterface(
 
 } // namespace
 
+bool CpuDeviceInterface::SwsFrameContext::operator==(
+    const CpuDeviceInterface::SwsFrameContext& other) const {
+  return inputWidth == other.inputWidth && inputHeight == other.inputHeight &&
+      inputFormat == other.inputFormat && outputWidth == other.outputWidth &&
+      outputHeight == other.outputHeight;
+}
+
+bool CpuDeviceInterface::SwsFrameContext::operator!=(
+    const CpuDeviceInterface::SwsFrameContext& other) const {
+  return !(*this == other);
+}
+
 CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
     : DeviceInterface(device) {
   TORCH_CHECK(g_cpu, "CpuDeviceInterface was not registered!");
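Note: the hand-written comparison operators above follow the usual pre-C++20 idiom. If the codebase could assume C++20, a defaulted operator== would generate the same member-wise comparison, and operator!= would be synthesized from it. A minimal sketch under that assumption; SwsFrameContextSketch is a hypothetical stand-in (int replaces AVPixelFormat so the snippet compiles without FFmpeg headers):

// Sketch only, assuming C++20; not the patch's actual code.
struct SwsFrameContextSketch {
  int inputWidth = 0;
  int inputHeight = 0;
  int inputFormat = 0; // stands in for AVPixelFormat
  int outputWidth = 0;
  int outputHeight = 0;
  // Member-wise equality, equivalent to the hand-written operator== above.
  bool operator==(const SwsFrameContextSketch&) const = default;
};

static_assert(SwsFrameContextSketch{640, 480} == SwsFrameContextSketch{640, 480});
static_assert(SwsFrameContextSketch{640, 480} != SwsFrameContextSketch{320, 240});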
@@ -56,31 +68,8 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
   }
 
   torch::Tensor outputTensor;
-  // We need to compare the current frame context with our previous frame
-  // context. If they are different, then we need to re-create our colorspace
-  // conversion objects. We create our colorspace conversion objects late so
-  // that we don't have to depend on the unreliable metadata in the header.
-  // And we sometimes re-create them because it's possible for frame
-  // resolution to change mid-stream. Finally, we want to reuse the colorspace
-  // conversion objects as much as possible for performance reasons.
   enum AVPixelFormat frameFormat =
       static_cast<enum AVPixelFormat>(avFrame->format);
-  FiltersContext filtersContext;
-
-  filtersContext.inputWidth = avFrame->width;
-  filtersContext.inputHeight = avFrame->height;
-  filtersContext.inputFormat = frameFormat;
-  filtersContext.inputAspectRatio = avFrame->sample_aspect_ratio;
-  filtersContext.outputWidth = expectedOutputWidth;
-  filtersContext.outputHeight = expectedOutputHeight;
-  filtersContext.outputFormat = AV_PIX_FMT_RGB24;
-  filtersContext.timeBase = timeBase;
-
-  std::stringstream filters;
-  filters << "scale=" << expectedOutputWidth << ":" << expectedOutputHeight;
-  filters << ":sws_flags=bilinear";
-
-  filtersContext.filters = filters.str();
 
   // By default, we want to use swscale for color conversion because it is
   // faster. However, it has width requirements, so we may need to fall back
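The comment ending this hunk is the heart of the library selection: swscale is the fast default, filtergraph the fallback when the output width doesn't meet swscale's requirements. A self-contained sketch of that selection; the "% 32" alignment test is an assumption for illustration only, since the actual condition sits in the lines elided between these hunks:

#include <optional>

enum class ColorConversionLibrary { SWSCALE, FILTERGRAPH };

// Hypothetical sketch: prefer swscale, fall back to filtergraph when the
// output width fails a (assumed) 32-pixel alignment requirement.
ColorConversionLibrary pickLibrary(
    int expectedOutputWidth,
    std::optional<ColorConversionLibrary> userChoice) {
  auto defaultLibrary = (expectedOutputWidth % 32 == 0)
      ? ColorConversionLibrary::SWSCALE
      : ColorConversionLibrary::FILTERGRAPH;
  // An explicit user choice wins, mirroring the value_or() call at the
  // top of the next hunk.
  return userChoice.value_or(defaultLibrary);
}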
@@ -101,12 +90,27 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
       videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);
 
   if (colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
+    // We need to compare the current frame context with our previous frame
+    // context. If they are different, then we need to re-create our colorspace
+    // conversion objects. We create our colorspace conversion objects late so
+    // that we don't have to depend on the unreliable metadata in the header.
+    // And we sometimes re-create them because it's possible for frame
+    // resolution to change mid-stream. Finally, we want to reuse the colorspace
+    // conversion objects as much as possible for performance reasons.
+    SwsFrameContext swsFrameContext;
+
+    swsFrameContext.inputWidth = avFrame->width;
+    swsFrameContext.inputHeight = avFrame->height;
+    swsFrameContext.inputFormat = frameFormat;
+    swsFrameContext.outputWidth = expectedOutputWidth;
+    swsFrameContext.outputHeight = expectedOutputHeight;
+
     outputTensor = preAllocatedOutputTensor.value_or(allocateEmptyHWCTensor(
         expectedOutputHeight, expectedOutputWidth, torch::kCPU));
 
-    if (!swsContext_ || prevFiltersContext_ != filtersContext) {
-      createSwsContext(filtersContext, avFrame->colorspace);
-      prevFiltersContext_ = std::move(filtersContext);
+    if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
+      createSwsContext(swsFrameContext, avFrame->colorspace);
+      prevSwsFrameContext_ = swsFrameContext;
     }
     int resultHeight =
         convertAVFrameToTensorUsingSwsScale(avFrame, outputTensor);
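The SWSCALE branch above is an instance of a small re-create-on-key-change pattern: keep the key describing the last-built conversion object, and rebuild only when the key differs. That is also why the first hunk needs operator!= on SwsFrameContext. A generic sketch of the shape (names hypothetical, not torchcodec API):

#include <memory>
#include <optional>
#include <utility>

// Rebuild the expensive object (an SwsContext in the diff) only when the
// describing key (SwsFrameContext in the diff) changes; Key must be
// comparable with !=.
template <typename Key, typename Obj, typename MakeObj>
Obj& getOrRecreate(
    Key key,
    std::optional<Key>& prevKey,
    std::unique_ptr<Obj>& obj,
    MakeObj makeObj) {
  if (!obj || prevKey != key) {
    obj = makeObj(key);
    prevKey = std::move(key);
  }
  return *obj;
}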
@@ -122,6 +126,23 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
 
     frameOutput.data = outputTensor;
   } else if (colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
+    FiltersContext filtersContext;
+
+    filtersContext.inputWidth = avFrame->width;
+    filtersContext.inputHeight = avFrame->height;
+    filtersContext.inputFormat = frameFormat;
+    filtersContext.inputAspectRatio = avFrame->sample_aspect_ratio;
+    filtersContext.outputWidth = expectedOutputWidth;
+    filtersContext.outputHeight = expectedOutputHeight;
+    filtersContext.outputFormat = AV_PIX_FMT_RGB24;
+    filtersContext.timeBase = timeBase;
+
+    std::stringstream filters;
+    filters << "scale=" << expectedOutputWidth << ":" << expectedOutputHeight;
+    filters << ":sws_flags=bilinear";
+
+    filtersContext.filtergraphStr = filters.str();
+
     if (!filterGraphContext_ || prevFiltersContext_ != filtersContext) {
       filterGraphContext_ =
           std::make_unique<FilterGraph>(filtersContext, videoStreamOptions);
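For reference, the string assembled above for a 1280x720 target is "scale=1280:720:sws_flags=bilinear", i.e. FFmpeg's scale filter with bilinear interpolation. A standalone check of just the string construction:

#include <iostream>
#include <sstream>

// Prints "scale=1280:720:sws_flags=bilinear", matching the filtergraph
// description the branch above hands to FilterGraph.
int main() {
  int expectedOutputWidth = 1280;
  int expectedOutputHeight = 720;
  std::stringstream filters;
  filters << "scale=" << expectedOutputWidth << ":" << expectedOutputHeight;
  filters << ":sws_flags=bilinear";
  std::cout << filters.str() << "\n";
  return 0;
}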
@@ -196,15 +217,15 @@ torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
 }
 
 void CpuDeviceInterface::createSwsContext(
-    const FiltersContext& filtersContext,
+    const SwsFrameContext& swsFrameContext,
     const enum AVColorSpace colorspace) {
   SwsContext* swsContext = sws_getContext(
-      filtersContext.inputWidth,
-      filtersContext.inputHeight,
-      filtersContext.inputFormat,
-      filtersContext.outputWidth,
-      filtersContext.outputHeight,
-      filtersContext.outputFormat,
+      swsFrameContext.inputWidth,
+      swsFrameContext.inputHeight,
+      swsFrameContext.inputFormat,
+      swsFrameContext.outputWidth,
+      swsFrameContext.outputHeight,
+      AV_PIX_FMT_RGB24,
       SWS_BILINEAR,
       nullptr,
       nullptr,
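The hunk is cut off after the second nullptr; in FFmpeg's API those trailing arguments of sws_getContext are the optional source/destination SwsFilter pointers, followed by a const double* of per-scaler tuning params. A minimal sketch of the same call in isolation (helper name hypothetical, not part of the patch):

extern "C" {
#include <libswscale/swscale.h>
}

// Builds a bilinear scaler into RGB24, as createSwsContext now does with
// the output format hardcoded. The caller owns the returned context and
// must release it with sws_freeContext().
SwsContext* makeRgb24Scaler(
    int inputWidth,
    int inputHeight,
    AVPixelFormat inputFormat,
    int outputWidth,
    int outputHeight) {
  return sws_getContext(
      inputWidth,
      inputHeight,
      inputFormat,
      outputWidth,
      outputHeight,
      AV_PIX_FMT_RGB24,
      SWS_BILINEAR,
      nullptr,  // source filter
      nullptr,  // destination filter
      nullptr); // tuning params
}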