Skip to content

Commit 466fab8

Browse files
committed
Add missing Term Vector Filters
Closes #2210
1 parent 221cf92 commit 466fab8

File tree

3 files changed

+154
-7
lines changed

3 files changed

+154
-7
lines changed

src/Nest/Document/Single/TermVectors/TermVectorFilter.cs

Lines changed: 109 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,41 +4,146 @@
44

55
namespace Nest
66
{
7+
/// <summary>
8+
/// Filter terms returned based on their TF-IDF scores.
9+
/// This can be useful in order to find out a good characteristic vector of a document.
10+
/// </summary>
711
[JsonObject(MemberSerialization.OptIn)]
812
public interface ITermVectorFilter
913
{
14+
/// <summary>
15+
/// Maximum number of terms that must be returned per field. Defaults to 25.
16+
/// </summary>
1017
[JsonProperty("max_num_terms")]
1118
int? MaximumNumberOfTerms { get; set; }
1219

20+
/// <summary>
21+
/// Ignore words with less than this frequency in the source doc. Defaults to 1.
22+
/// </summary>
1323
[JsonProperty("min_term_freq")]
1424
int? MinimumTermFrequency { get; set; }
1525

26+
/// <summary>
27+
/// Ignore words with more than this frequency in the source doc. Defaults to unbounded.
28+
/// </summary>
29+
[JsonProperty("max_term_freq")]
30+
int? MaximumTermFrequency { get; set; }
31+
32+
/// <summary>
33+
/// Ignore terms which do not occur in at least this many docs. Defaults to 1.
34+
/// </summary>
1635
[JsonProperty("min_doc_freq")]
1736
int? MinimumDocumentFrequency { get; set; }
37+
38+
/// <summary>
39+
/// Ignore words which occur in more than this many docs. Defaults to unbounded.
40+
/// </summary>
41+
[JsonProperty("max_doc_freq")]
42+
int? MaximumDocumentFrequency { get; set; }
43+
44+
/// <summary>
45+
/// The minimum word length below which words will be ignored. Defaults to 0.
46+
/// </summary>
47+
[JsonProperty("min_word_length")]
48+
int? MinimumWordLength { get; set; }
49+
50+
/// <summary>
51+
/// The maximum word length above which words will be ignored. Defaults to unbounded.
52+
/// </summary>
53+
[JsonProperty("max_word_length")]
54+
int? MaximumWordLength { get; set; }
1855
}
1956

57+
/// <summary>
58+
/// Filter terms returned based on their TF-IDF scores.
59+
/// This can be useful in order to find out a good characteristic vector of a document.
60+
/// </summary>
2061
public class TermVectorFilter : ITermVectorFilter
2162
{
63+
/// <summary>
64+
/// Maximum number of terms that must be returned per field. Defaults to 25.
65+
/// </summary>
2266
public int? MaximumNumberOfTerms { get; set; }
2367

68+
/// <summary>
69+
/// Ignore words with less than this frequency in the source doc. Defaults to 1.
70+
/// </summary>
2471
public int? MinimumTermFrequency { get; set; }
2572

73+
/// <summary>
74+
/// Ignore words with more than this frequency in the source doc. Defaults to unbounded.
75+
/// </summary>
76+
public int? MaximumTermFrequency { get; set; }
77+
78+
/// <summary>
79+
/// Ignore terms which do not occur in at least this many docs. Defaults to 1.
80+
/// </summary>
2681
public int? MinimumDocumentFrequency { get; set; }
82+
83+
/// <summary>
84+
/// Ignore words which occur in more than this many docs. Defaults to unbounded.
85+
/// </summary>
86+
public int? MaximumDocumentFrequency { get; set; }
87+
88+
/// <summary>
89+
/// The minimum word length below which words will be ignored. Defaults to 0.
90+
/// </summary>
91+
public int? MinimumWordLength { get; set; }
92+
93+
/// <summary>
94+
/// The maximum word length above which words will be ignored. Defaults to unbounded.
95+
/// </summary>
96+
public int? MaximumWordLength { get; set; }
2797
}
2898

99+
/// <summary>
100+
/// Filter terms returned based on their TF-IDF scores.
101+
/// This can be useful in order to find out a good characteristic vector of a document.
102+
/// </summary>
29103
public class TermVectorFilterDescriptor
30104
: DescriptorBase<TermVectorFilterDescriptor, ITermVectorFilter>, ITermVectorFilter
31105
{
32106
int? ITermVectorFilter.MaximumNumberOfTerms { get; set; }
33-
34-
int? ITermVectorFilter.MinimumDocumentFrequency { get; set; }
35-
36107
int? ITermVectorFilter.MinimumTermFrequency { get; set; }
108+
int? ITermVectorFilter.MaximumTermFrequency { get; set; }
109+
int? ITermVectorFilter.MinimumDocumentFrequency { get; set; }
110+
int? ITermVectorFilter.MaximumDocumentFrequency { get; set; }
111+
int? ITermVectorFilter.MinimumWordLength { get; set; }
112+
int? ITermVectorFilter.MaximumWordLength { get; set; }
37113

114+
/// <summary>
115+
/// Maximum number of terms that must be returned per field. Defaults to 25.
116+
/// </summary>
38117
public TermVectorFilterDescriptor MaximimumNumberOfTerms(int maxNumTerms) => Assign(a => a.MaximumNumberOfTerms = maxNumTerms);
39118

119+
/// <summary>
120+
/// Ignore words with less than this frequency in the source doc. Defaults to 1.
121+
/// </summary>
122+
public TermVectorFilterDescriptor MinimumTermFrequency(int minTermFreq) => Assign(a => a.MinimumTermFrequency = minTermFreq);
123+
124+
/// <summary>
125+
/// Ignore words with more than this frequency in the source doc. Defaults to unbounded.
126+
/// </summary>
127+
public TermVectorFilterDescriptor MaximumTermFrequency(int maxTermFreq) => Assign(a => a.MaximumTermFrequency = maxTermFreq);
128+
129+
/// <summary>
130+
/// Ignore terms which do not occur in at least this many docs. Defaults to 1.
131+
/// </summary>
40132
public TermVectorFilterDescriptor MinimumDocumentFrequency(int minDocFreq) => Assign(a => a.MinimumDocumentFrequency = minDocFreq);
41133

42-
public TermVectorFilterDescriptor MinimumTermFrequency(int minTermFreq) => Assign(a => a.MinimumTermFrequency = minTermFreq);
134+
/// <summary>
135+
/// Ignore words which occur in more than this many docs. Defaults to unbounded.
136+
/// </summary>
137+
public TermVectorFilterDescriptor MaximumDocumentFrequency(int maxDocFreq) => Assign(a => a.MaximumDocumentFrequency = maxDocFreq);
138+
139+
/// <summary>
140+
/// The minimum word length below which words will be ignored. Defaults to 0.
141+
/// </summary>
142+
public TermVectorFilterDescriptor MinimumWordLength(int minWordLength) => Assign(a => a.MinimumWordLength = minWordLength);
143+
144+
/// <summary>
145+
/// The maximum word length above which words will be ignored. Defaults to unbounded.
146+
/// </summary>
147+
public TermVectorFilterDescriptor MaximumWordLength(int maxWordLength) => Assign(a => a.MaximumWordLength = maxWordLength);
43148
}
44149
}

src/Nest/Document/Single/TermVectors/TermVectorsRequest.cs

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,22 @@ public partial interface ITermVectorsRequest<TDocument>
99
where TDocument : class
1010
{
1111
/// <summary>
12-
/// An optional document to get termvectors for instead of using an already indexed document
12+
/// An optional document to get term vectors for instead of using an already indexed document
1313
/// </summary>
1414
[JsonProperty("doc")]
1515
TDocument Document { get; set; }
1616

17+
/// <summary>
18+
/// Provide a different analyzer than the one at the field.
19+
/// This is useful in order to generate term vectors in any fashion, especially when using artificial documents.
20+
/// </summary>
1721
[JsonProperty("per_field_analyzer")]
1822
IPerFieldAnalyzer PerFieldAnalyzer { get; set; }
1923

24+
/// <summary>
25+
/// Filter the terms returned based on their TF-IDF scores.
26+
/// This can be useful in order to find out a good characteristic vector of a document.
27+
/// </summary>
2028
[JsonProperty("filter")]
2129
ITermVectorFilter Filter { get; set; }
2230
}
@@ -26,10 +34,21 @@ public partial class TermVectorsRequest<TDocument>
2634
{
2735
HttpMethod IRequest.HttpMethod => (this.Document != null || this.Filter != null) ? HttpMethod.POST : HttpMethod.GET;
2836

37+
/// <summary>
38+
/// An optional document to get term vectors for instead of using an already indexed document
39+
/// </summary>
2940
public TDocument Document { get; set; }
3041

42+
/// <summary>
43+
/// Provide a different analyzer than the one at the field.
44+
/// This is useful in order to generate term vectors in any fashion, especially when using artificial documents.
45+
/// </summary>
3146
public IPerFieldAnalyzer PerFieldAnalyzer { get; set; }
3247

48+
/// <summary>
49+
/// Filter the terms returned based on their TF-IDF scores.
50+
/// This can be useful in order to find out a good characteristic vector of a document.
51+
/// </summary>
3352
public ITermVectorFilter Filter { get; set; }
3453

3554
partial void DocumentFromPath(TDocument document)
@@ -51,11 +70,22 @@ public partial class TermVectorsDescriptor<TDocument> where TDocument : class
5170

5271
ITermVectorFilter ITermVectorsRequest<TDocument>.Filter { get; set; }
5372

73+
/// <summary>
74+
/// An optional document to get term vectors for instead of using an already indexed document
75+
/// </summary>
5476
public TermVectorsDescriptor<TDocument> Document(TDocument document) => Assign(a => a.Document = document);
5577

78+
/// <summary>
79+
/// Provide a different analyzer than the one at the field.
80+
/// This is useful in order to generate term vectors in any fashion, especially when using artificial documents.
81+
/// </summary>
5682
public TermVectorsDescriptor<TDocument> PerFieldAnalyzer(Func<PerFieldAnalyzerDescriptor<TDocument>, IPromise<IPerFieldAnalyzer>> analyzerSelector) =>
5783
Assign(a => a.PerFieldAnalyzer = analyzerSelector?.Invoke(new PerFieldAnalyzerDescriptor<TDocument>())?.Value);
5884

85+
/// <summary>
86+
/// Filter the terms returned based on their TF-IDF scores.
87+
/// This can be useful in order to find out a good characteristic vector of a document.
88+
/// </summary>
5989
public TermVectorsDescriptor<TDocument> Filter(Func<TermVectorFilterDescriptor, ITermVectorFilter> filterSelector) =>
6090
Assign(a => a.Filter = filterSelector?.Invoke(new TermVectorFilterDescriptor()));
6191
}

src/Tests/Document/Single/TermVectors/TermVectorsApiTests.cs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,11 @@ protected override LazyResponses ClientUsage() => Calls(
3333
{
3434
max_num_terms = 3,
3535
min_term_freq = 1,
36-
min_doc_freq = 1
36+
max_term_freq = 10,
37+
min_doc_freq = 1,
38+
max_doc_freq = int.MaxValue,
39+
min_word_length = 0,
40+
max_word_length = 200
3741
}
3842
};
3943

@@ -45,7 +49,11 @@ protected override LazyResponses ClientUsage() => Calls(
4549
.Filter(f => f
4650
.MaximimumNumberOfTerms(3)
4751
.MinimumTermFrequency(1)
52+
.MaximumTermFrequency(10)
4853
.MinimumDocumentFrequency(1)
54+
.MaximumDocumentFrequency(int.MaxValue)
55+
.MinimumWordLength(0)
56+
.MaximumWordLength(200)
4957
)
5058
;
5159

@@ -56,7 +64,11 @@ protected override LazyResponses ClientUsage() => Calls(
5664
{
5765
MaximumNumberOfTerms = 3,
5866
MinimumTermFrequency = 1,
59-
MinimumDocumentFrequency = 1
67+
MaximumTermFrequency = 10,
68+
MinimumDocumentFrequency = 1,
69+
MaximumDocumentFrequency = int.MaxValue,
70+
MinimumWordLength = 0,
71+
MaximumWordLength = 200
6072
}
6173
};
6274
}

0 commit comments

Comments
 (0)