Skip to content

Commit 1c850b5

Browse files
committed
Extract UnifiedDiffSink
1 parent 202bc6d commit 1c850b5

File tree

3 files changed

+298
-38
lines changed

3 files changed

+298
-38
lines changed

gix-diff/src/blob/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ pub mod pipeline;
1212
pub mod platform;
1313

1414
pub mod unified_diff;
15-
pub use unified_diff::_impl::UnifiedDiff;
15+
pub use unified_diff::_impl::{UnifiedDiff, UnifiedDiffSink};
1616

1717
/// Information about the diff performed to detect similarity.
1818
#[derive(Debug, Default, Clone, Copy, PartialEq, PartialOrd)]

gix-diff/src/blob/unified_diff.rs

Lines changed: 239 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,35 @@ impl ContextSize {
2626
}
2727
}
2828

29+
/// Represents the type of a line in a unified diff.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DiffLineType {
    /// A line that exists in both old and new versions (context line).
    Context,
    /// A line that was added in the new version.
    Add,
    /// A line that was removed from the old version.
    Remove,
}

impl DiffLineType {
    /// Return the marker that prefixes a line of this type, as a `char`.
    ///
    /// This is derived from [`to_byte_prefix()`](Self::to_byte_prefix) so that the textual and
    /// the byte form of the marker share a single table and cannot drift apart.
    fn to_prefix(self) -> char {
        // All markers are ASCII, hence the byte converts to the identical character.
        char::from(self.to_byte_prefix())
    }

    /// Return the marker that prefixes a line of this type, as a single ASCII byte:
    /// a space for context lines, `+` for additions and `-` for removals.
    fn to_byte_prefix(self) -> u8 {
        match self {
            DiffLineType::Context => b' ',
            DiffLineType::Add => b'+',
            DiffLineType::Remove => b'-',
        }
    }
}
57+
2958
/// Specify where to put a newline.
3059
#[derive(Debug, Copy, Clone)]
3160
pub enum NewlineSeparator<'a> {
@@ -39,6 +68,31 @@ pub enum NewlineSeparator<'a> {
3968
AfterHeaderAndWhenNeeded(&'a str),
4069
}
4170

71+
/// A utility trait for use in [`UnifiedDiffSink`](super::UnifiedDiffSink).
///
/// Implementors receive each hunk as a list of typed lines rather than as pre-formatted text.
72+
pub trait ConsumeTypedHunk {
73+
/// The item this instance produces after consuming all hunks.
74+
/// It is the value returned by [`finish()`](Self::finish).
75+
type Out;
76+
77+
/// Consume a single hunk made up of `lines`, each raw line content paired with its [`DiffLineType`].
78+
/// `header` is the hunk header, and `before_hunk_*` / `after_hunk_*` describe the start and length
/// of the hunk on the old and new side respectively (presumably with the same meaning as in
/// `ConsumeHunk::consume_hunk` - TODO confirm).
///
/// The line content is passed without a prefix marker; the implementations in this file
/// add the marker themselves via `DiffLineType`.
/// Return an error to signal that the hunk could not be processed.
///
79+
/// TODO: How do we want to pass the header to `consume_hunk`? We can add an additional parameter
80+
/// similar to `ConsumeHunk::consume_hunk` or add `DiffLineType::Header`, in which case we
81+
/// wouldn't have to add an additional parameter.
82+
fn consume_hunk(
83+
&mut self,
84+
before_hunk_start: u32,
85+
before_hunk_len: u32,
86+
after_hunk_start: u32,
87+
after_hunk_len: u32,
88+
header: &str,
89+
lines: &[(DiffLineType, &[u8])],
90+
) -> std::io::Result<()>;
91+
92+
/// Called when processing is complete, consuming the instance to produce its [`Out`](Self::Out) value.
93+
fn finish(self) -> Self::Out;
94+
}
95+
4296
/// A utility trait for use in [`UnifiedDiff`](super::UnifiedDiff).
4397
pub trait ConsumeHunk {
4498
/// The item this instance produces after consuming all hunks.
@@ -75,18 +129,13 @@ pub(super) mod _impl {
75129
use imara_diff::{intern, Sink};
76130
use intern::{InternedInput, Interner, Token};
77131

78-
use super::{ConsumeHunk, ContextSize, NewlineSeparator};
79-
80-
const CONTEXT: char = ' ';
81-
const ADDITION: char = '+';
82-
const REMOVAL: char = '-';
132+
use super::{ConsumeHunk, ConsumeTypedHunk, ContextSize, DiffLineType, NewlineSeparator};
83133

84-
/// A [`Sink`] that creates a textual diff in the format typically output by git or `gnu-diff` if the `-u` option is used,
85-
/// and passes it in full to a consumer.
86-
pub struct UnifiedDiff<'a, T, D>
134+
/// A [`Sink`] that creates a unified diff and processes it hunk-by-hunk with structured type information.
135+
pub struct UnifiedDiffSink<'a, T, D>
87136
where
88137
T: Hash + Eq + AsRef<[u8]>,
89-
D: ConsumeHunk,
138+
D: ConsumeTypedHunk,
90139
{
91140
before: &'a [Token],
92141
after: &'a [Token],
@@ -106,26 +155,31 @@ pub(super) mod _impl {
106155

107156
/// Symmetrical context before and after the changed hunk.
108157
ctx_size: u32,
158+
// TODO:
159+
// Is there a way to remove `newline` from `UnifiedDiffSink` as it is purely
160+
// formatting-related?
161+
// One option would be to introduce `HunkHeader` with a method `format_header` that could
162+
// then be called outside `UnifiedDiffSink`, potentially taking `newline` as an argument.
109163
newline: NewlineSeparator<'a>,
110164

111-
buffer: Vec<u8>,
165+
buffer: Vec<(DiffLineType, Vec<u8>)>,
112166
header_buf: String,
113167
delegate: D,
114168

115169
err: Option<std::io::Error>,
116170
}
117171

118-
impl<'a, T, D> UnifiedDiff<'a, T, D>
172+
impl<'a, T, D> UnifiedDiffSink<'a, T, D>
119173
where
120174
T: Hash + Eq + AsRef<[u8]>,
121-
D: ConsumeHunk,
175+
D: ConsumeTypedHunk,
122176
{
123-
/// Create a new instance to create unified diff using the lines in `input`,
177+
/// Create a new instance to create a unified diff using the lines in `input`,
124178
/// which also must be used when running the diff algorithm.
125179
/// `context_size` is the amount of lines around each hunk which will be passed
126-
///to `consume_hunk`.
180+
/// to the sink.
127181
///
128-
/// `consume_hunk` is called for each hunk in unified-diff format, as created from each line separated by `newline_separator`.
182+
/// The sink's `consume_hunk` method is called for each hunk with structured type information.
129183
pub fn new(
130184
input: &'a InternedInput<T>,
131185
consume_hunk: D,
@@ -154,21 +208,10 @@ pub(super) mod _impl {
154208
}
155209
}
156210

157-
fn print_tokens(&mut self, tokens: &[Token], prefix: char) {
211+
fn print_tokens(&mut self, tokens: &[Token], line_type: DiffLineType) {
158212
for &token in tokens {
159-
self.buffer.push_char(prefix);
160-
let line = &self.interner[token];
161-
self.buffer.push_str(line);
162-
match self.newline {
163-
NewlineSeparator::AfterHeaderAndLine(nl) => {
164-
self.buffer.push_str(nl);
165-
}
166-
NewlineSeparator::AfterHeaderAndWhenNeeded(nl) => {
167-
if !line.as_ref().ends_with_str(nl) {
168-
self.buffer.push_str(nl);
169-
}
170-
}
171-
}
213+
let content = self.interner[token].as_ref().to_vec();
214+
self.buffer.push((line_type, content));
172215
}
173216
}
174217

@@ -200,21 +243,36 @@ pub(super) mod _impl {
200243
),
201244
)
202245
.map_err(|err| std::io::Error::new(ErrorKind::Other, err))?;
246+
247+
// TODO:
248+
// Is this explicit conversion necessary?
249+
// Is the comment necessary?
250+
// Convert Vec<(DiffLineType, Vec<u8>)> to Vec<(DiffLineType, &[u8])>
251+
let lines: Vec<(DiffLineType, &[u8])> = self
252+
.buffer
253+
.iter()
254+
.map(|(line_type, content)| (*line_type, content.as_slice()))
255+
.collect();
256+
203257
self.delegate.consume_hunk(
204258
hunk_start,
205259
self.before_hunk_len,
206260
hunk_end,
207261
self.after_hunk_len,
208262
&self.header_buf,
209-
&self.buffer,
263+
&lines,
210264
)?;
211265

212266
self.reset_hunks();
213267
Ok(())
214268
}
215269

216270
fn print_context_and_update_pos(&mut self, print: Range<u32>, move_to: u32) {
217-
self.print_tokens(&self.before[print.start as usize..print.end as usize], CONTEXT);
271+
self.print_tokens(
272+
&self.before[print.start as usize..print.end as usize],
273+
DiffLineType::Context,
274+
);
275+
218276
let len = print.end - print.start;
219277
self.ctx_pos = Some(move_to);
220278
self.before_hunk_len += len;
@@ -232,10 +290,10 @@ pub(super) mod _impl {
232290
}
233291
}
234292

235-
impl<T, D> Sink for UnifiedDiff<'_, T, D>
293+
impl<T, D> Sink for UnifiedDiffSink<'_, T, D>
236294
where
237295
T: Hash + Eq + AsRef<[u8]>,
238-
D: ConsumeHunk,
296+
D: ConsumeTypedHunk,
239297
{
240298
type Out = std::io::Result<D::Out>;
241299

@@ -270,8 +328,11 @@ pub(super) mod _impl {
270328
self.before_hunk_len += before.end - before.start;
271329
self.after_hunk_len += after.end - after.start;
272330

273-
self.print_tokens(&self.before[before.start as usize..before.end as usize], REMOVAL);
274-
self.print_tokens(&self.after[after.start as usize..after.end as usize], ADDITION);
331+
self.print_tokens(
332+
&self.before[before.start as usize..before.end as usize],
333+
DiffLineType::Remove,
334+
);
335+
self.print_tokens(&self.after[after.start as usize..after.end as usize], DiffLineType::Add);
275336
}
276337

277338
fn finish(mut self) -> Self::Out {
@@ -285,6 +346,95 @@ pub(super) mod _impl {
285346
}
286347
}
287348

349+
/// A [`ConsumeTypedHunk`] implementation that creates a textual diff in the format typically output by git or
/// `gnu-diff` if the `-u` option is used,
350+
/// and passes it in full to a consumer.
///
/// Note that this type is not a `Sink` itself: `UnifiedDiff::new()` returns a [`UnifiedDiffSink`]
/// that uses an instance of this type as its delegate.
351+
pub struct UnifiedDiff<'a, D>
352+
where
353+
D: ConsumeHunk,
354+
{
355+
/// The consumer that receives each hunk once it was formatted into `buffer`.
delegate: D,
356+
/// Decides which line separator is appended to a formatted line, and when.
newline: NewlineSeparator<'a>,
357+
/// The formatted text of the hunk currently being consumed; cleared at the start of each hunk.
buffer: Vec<u8>,
358+
}
359+
360+
impl<'a, D> UnifiedDiff<'a, D>
361+
where
362+
D: ConsumeHunk,
363+
{
364+
/// Create a new instance to create a unified diff using the lines in `input`,
365+
/// which also must be used when running the diff algorithm.
366+
/// `context_size` is the amount of lines around each hunk which will be passed
367+
/// to `consume_hunk`.
368+
///
369+
/// `consume_hunk` is called for each hunk in unified-diff format, as created from each line separated by `newline_separator`.
370+
pub fn new<T>(
371+
input: &'a InternedInput<T>,
372+
consume_hunk: D,
373+
newline_separator: NewlineSeparator<'a>,
374+
context_size: ContextSize,
375+
) -> UnifiedDiffSink<'a, T, Self>
376+
where
377+
T: Hash + Eq + AsRef<[u8]>,
378+
{
379+
let formatter = Self {
380+
delegate: consume_hunk,
381+
newline: newline_separator,
382+
buffer: Vec::new(),
383+
};
384+
// TODO:
385+
// Should this return a `UnifiedDiff` instead of a `UnifiedDiffSink`?
386+
UnifiedDiffSink::new(input, formatter, newline_separator, context_size)
387+
}
388+
389+
fn format_line(&mut self, line_type: DiffLineType, content: &[u8]) {
390+
self.buffer.push(line_type.to_byte_prefix());
391+
self.buffer.push_str(content);
392+
match self.newline {
393+
NewlineSeparator::AfterHeaderAndLine(nl) => {
394+
self.buffer.push_str(nl);
395+
}
396+
NewlineSeparator::AfterHeaderAndWhenNeeded(nl) => {
397+
if !content.ends_with_str(nl) {
398+
self.buffer.push_str(nl);
399+
}
400+
}
401+
}
402+
}
403+
}
404+
405+
impl<D: ConsumeHunk> ConsumeTypedHunk for UnifiedDiff<'_, D> {
406+
type Out = D::Out;
407+
408+
fn consume_hunk(
409+
&mut self,
410+
before_hunk_start: u32,
411+
before_hunk_len: u32,
412+
after_hunk_start: u32,
413+
after_hunk_len: u32,
414+
header: &str,
415+
lines: &[(DiffLineType, &[u8])],
416+
) -> std::io::Result<()> {
417+
self.buffer.clear();
418+
419+
for &(line_type, content) in lines {
420+
self.format_line(line_type, content);
421+
}
422+
423+
self.delegate.consume_hunk(
424+
before_hunk_start,
425+
before_hunk_len,
426+
after_hunk_start,
427+
after_hunk_len,
428+
&header,
429+
&self.buffer,
430+
)
431+
}
432+
433+
fn finish(self) -> Self::Out {
434+
self.delegate.finish()
435+
}
436+
}
437+
288438
/// An implementation that fails if the input isn't UTF-8.
289439
impl ConsumeHunk for String {
290440
type Out = Self;
@@ -317,4 +467,58 @@ pub(super) mod _impl {
317467
self
318468
}
319469
}
470+
471+
impl ConsumeTypedHunk for String {
472+
type Out = Self;
473+
474+
fn consume_hunk(
475+
&mut self,
476+
_: u32,
477+
_: u32,
478+
_: u32,
479+
_: u32,
480+
header: &str,
481+
lines: &[(DiffLineType, &[u8])],
482+
) -> std::io::Result<()> {
483+
self.push_str(header);
484+
for &(line_type, content) in lines {
485+
self.push(line_type.to_prefix());
486+
// TODO:
487+
// How does `impl ConsumeHunk for String` handle errors?
488+
self.push_str(std::str::from_utf8(content).map_err(|e| std::io::Error::new(ErrorKind::Other, e))?);
489+
self.push('\n');
490+
}
491+
Ok(())
492+
}
493+
494+
fn finish(self) -> Self::Out {
495+
self
496+
}
497+
}
498+
499+
impl ConsumeTypedHunk for Vec<u8> {
500+
type Out = Self;
501+
502+
fn consume_hunk(
503+
&mut self,
504+
_: u32,
505+
_: u32,
506+
_: u32,
507+
_: u32,
508+
header: &str,
509+
lines: &[(DiffLineType, &[u8])],
510+
) -> std::io::Result<()> {
511+
self.push_str(header);
512+
for &(line_type, content) in lines {
513+
self.push(line_type.to_byte_prefix());
514+
self.extend_from_slice(content);
515+
self.push(b'\n');
516+
}
517+
Ok(())
518+
}
519+
520+
fn finish(self) -> Self::Out {
521+
self
522+
}
523+
}
320524
}

0 commit comments

Comments
 (0)