Skip to content

Commit 21d29a2

Browse files
authored
Merge pull request hpcc-systems#20698 from ghalliday/issue35365
HPCC-35365 Add LENGTHSIZE attribute to STRING and UNICODE fields Reviewed-By: Dan S. Camper <dan.camper@lexisnexisrisk.com> Merged-by: Gavin Halliday <gavin.halliday@lexisnexisrisk.com>
2 parents 4a46437 + 6948c2b commit 21d29a2

35 files changed

+1515
-293
lines changed

common/deftype/deftype.cpp

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1540,7 +1540,7 @@ extern DEFTYPE_API ITypeInfo *makeUnicodeType(unsigned len, IAtom * locale)
15401540
else
15411541
{
15421542
if(isUnknownLength(len))
1543-
ret = new CUnicodeTypeInfo(UNKNOWN_LENGTH, locale);
1543+
ret = new CUnicodeTypeInfo(len, locale);
15441544
else
15451545
ret = new CUnicodeTypeInfo(len*2, locale);
15461546
utt->setValue(key, ret);
@@ -1565,7 +1565,7 @@ extern DEFTYPE_API ITypeInfo *makeVarUnicodeType(unsigned len, IAtom * locale)
15651565
else
15661566
{
15671567
if(isUnknownLength(len))
1568-
ret = new CVarUnicodeTypeInfo(UNKNOWN_LENGTH, locale);
1568+
ret = new CVarUnicodeTypeInfo(len, locale);
15691569
else
15701570
ret = new CVarUnicodeTypeInfo((len+1)*2, locale);
15711571
vutt->setValue(key, ret);
@@ -1589,7 +1589,7 @@ extern DEFTYPE_API ITypeInfo *makeUtf8Type(unsigned len, IAtom * locale)
15891589
else
15901590
{
15911591
if (isUnknownLength(len))
1592-
ret = new CUtf8TypeInfo(UNKNOWN_LENGTH, locale);
1592+
ret = new CUtf8TypeInfo(len, locale);
15931593
else
15941594
ret = new CUtf8TypeInfo(len*4, locale);
15951595
u8tt->setValue(key, ret);
@@ -3026,6 +3026,18 @@ ICharsetInfo * getAsciiCharset()
30263026

30273027
ITypeInfo * getStretchedType(unsigned newLen, ITypeInfo * type)
30283028
{
3029+
#ifdef PRESERVE_STRETCHED_MODIFIERS
3030+
// This code is not currently used, but preserved as a reminder that it might be needed in future.
3031+
if (type->queryModifier() != typemod_none)
3032+
{
3033+
ITypeInfo * srcType = type->queryTypeBase();
3034+
ITypeInfo * stretchedType = getStretchedType(newLen, srcType);
3035+
if (stretchedType == srcType)
3036+
return LINK(type);
3037+
return cloneModifier(type, stretchedType);
3038+
}
3039+
#endif
3040+
30293041
switch (type->getTypeCode())
30303042
{
30313043
case type_string:
@@ -3065,6 +3077,8 @@ ITypeInfo * getMaxLengthType(ITypeInfo * type)
30653077
case type_utf8:
30663078
case type_qstring:
30673079
case type_data:
3080+
if (isUnknownLength(type->getStringLen()))
3081+
return LINK(type);
30683082
return getStretchedType(UNKNOWN_LENGTH, type);
30693083
default:
30703084
return LINK(type);
@@ -3328,6 +3342,25 @@ ITypeInfo * replaceChildType(ITypeInfo * type, ITypeInfo * newChild)
33283342
return cloneModifiers(type, newType);
33293343
}
33303344

3345+
// Reports whether a type code belongs to the string-like family handled here.
//
// Returns true for type_string, type_unicode, type_utf8, type_qstring and
// type_data; every other type code yields false.
// NOTE(review): going by the name, these are presumably the types whose
// length encoding may be overridden - confirm against the callers.
bool canOverrideStringLength(type_t tc)
{
    return (tc == type_string)
        || (tc == type_unicode)
        || (tc == type_utf8)
        || (tc == type_qstring)
        || (tc == type_data);
}
3358+
3359+
// Convenience overload: applies the type-code test to the code reported by
// type->getTypeCode(). The pointer is dereferenced unconditionally, so the
// caller must pass a non-null type.
bool canOverrideStringLength(ITypeInfo * type)
{
    const type_t typeCode = type->getTypeCode();
    return canOverrideStringLength(typeCode);
}
3363+
33313364
//---------------------------------------------------------------------------
33323365
extern DEFTYPE_API ICharsetInfo * deserializeCharsetInfo(MemoryBuffer &src)
33333366
{

common/deftype/deftype.hpp

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,16 @@ enum typemod_t
6666
};
6767

6868
#define INFINITE_LENGTH 0xFFFFFFF0
69-
#define UNKNOWN_LENGTH 0xFFFFFFF1
69+
70+
#ifndef UNKNOWN_LENGTH
71+
#define UNKNOWN_LENGTH 0xFFFFFFF1 // This value may be persisted, so don't change it
7072
#define UNKNOWN_LENGTH1 0xFFFFFFF2 // strings with 1 byte length prefix
7173
#define UNKNOWN_LENGTH2 0xFFFFFFF3 // strings with 2 byte length prefix
74+
#else
75+
static_assert(UNKNOWN_LENGTH == 0xFFFFFFF1);
76+
static_assert(UNKNOWN_LENGTH1 == 0xFFFFFFF2);
77+
static_assert(UNKNOWN_LENGTH2 == 0xFFFFFFF3);
78+
#endif
7279

7380
#define MAX_SUPPORTED_LENGTH (INFINITE_LENGTH-1U)
7481

@@ -77,6 +84,34 @@ inline bool isUnknownLength(size32_t length)
7784
{
7885
return length >= UNKNOWN_LENGTH;
7986
}
87+
// Maps a length-prefix width in bytes to the matching "unknown length"
// sentinel: 1 -> UNKNOWN_LENGTH1, 2 -> UNKNOWN_LENGTH2.
// Any other width (including 4 and 0) maps to the default UNKNOWN_LENGTH.
inline unsigned getUnknownLengthValue(size32_t lengthSize)
{
    if (lengthSize == 1)
        return UNKNOWN_LENGTH1;
    if (lengthSize == 2)
        return UNKNOWN_LENGTH2;
    return UNKNOWN_LENGTH;
}
96+
inline unsigned getLengthSizeBytes(unsigned length)
97+
{
98+
switch (length)
99+
{
100+
case UNKNOWN_LENGTH1: return 1;
101+
case UNKNOWN_LENGTH2: return 2;
102+
case UNKNOWN_LENGTH: return 4;
103+
}
104+
return 0;
105+
}
106+
// Returns the upper bound associated with an "unknown length" sentinel:
// 0xFF for UNKNOWN_LENGTH1 and 0xFFFF for UNKNOWN_LENGTH2 (the largest values
// a 1-byte / 2-byte prefix can hold). Every other input, including the
// default UNKNOWN_LENGTH, yields INFINITE_LENGTH-1.
inline size32_t getUnknownLengthMax(size32_t length)
{
    if (length == UNKNOWN_LENGTH1)
        return 0xFF;
    if (length == UNKNOWN_LENGTH2)
        return 0xFFFF;
    return INFINITE_LENGTH-1;
}
80115

81116
typedef enum type_vals type_t;
82117

@@ -266,6 +301,8 @@ inline bool isAnyType(ITypeInfo * type) { return type && (type->getTypeCode() ==
266301
inline bool isDecimalType(ITypeInfo * type) { return type && (type->getTypeCode() == type_decimal); }
267302
inline bool isDictionaryType(ITypeInfo * type) { return type && (type->getTypeCode() == type_dictionary); }
268303

304+
extern DEFTYPE_API bool canOverrideStringLength(type_t tc);
305+
extern DEFTYPE_API bool canOverrideStringLength(ITypeInfo * type);
269306

270307
//If casting a value from type before to type after is the value preserved.
271308
//If the value is not preserved then it means more than one source value can match a target value.

common/deftype/deftype.ipp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,14 @@ public:
190190
CUtf8TypeInfo(unsigned len, IAtom * _locale) : CUnicodeTypeInfo(len, _locale) {}
191191

192192
virtual type_t getTypeCode() const { return type_utf8; };
193-
virtual unsigned getSize() { return UNKNOWN_LENGTH; };
193+
virtual unsigned getSize()
194+
{
195+
//If unknown length then preserve the size-bytes information
196+
if (isUnknownLength(length))
197+
return length;
198+
//the size of a utf8-encoded type is never known
199+
return UNKNOWN_LENGTH;
200+
}
194201
virtual unsigned getStringLen() { return !isUnknownLength(length) ? length/4 : length; };
195202

196203
using CUnicodeTypeInfo::castFrom;

common/fileview2/fvresultset.cpp

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -335,25 +335,26 @@ void CResultSetMetaData::calcFieldOffsets(const byte * data, unsigned * offsets)
335335
if (isUnknownLength(size))
336336
{
337337
const byte * cur = data + curOffset;
338+
unsigned lengthSize = getLengthSizeBytes(size);
338339
switch (type.getTypeCode())
339340
{
340341
case type_data:
341342
case type_string:
342343
case type_table:
343344
case type_groupedtable:
344-
size = *((unsigned *)cur) + sizeof(unsigned);
345+
size = rtlReadInt(cur, lengthSize) + lengthSize;
345346
break;
346347
case type_set:
347-
size = *((unsigned *)(cur + sizeof(bool))) + sizeof(unsigned) + sizeof(bool);
348+
size = rtlReadInt(cur + sizeof(bool), lengthSize) + lengthSize + sizeof(bool);
348349
break;
349350
case type_qstring:
350-
size = rtlQStrSize(*((unsigned *)cur)) + sizeof(unsigned);
351+
size = rtlQStrSize(rtlReadInt(cur, lengthSize)) + lengthSize;
351352
break;
352353
case type_unicode:
353-
size = *((unsigned *)cur)*sizeof(UChar) + sizeof(unsigned);
354+
size = rtlReadInt(cur, lengthSize) * sizeof(UChar) + lengthSize;
354355
break;
355356
case type_utf8:
356-
size = sizeof(unsigned) + rtlUtf8Size(*(unsigned *)cur, cur+sizeof(unsigned));
357+
size = lengthSize + rtlUtf8Size(rtlReadInt(cur, lengthSize), cur+lengthSize);
357358
break;
358359
case type_varstring:
359360
size = strlen((char *)cur)+1;
@@ -897,8 +898,9 @@ static unsigned getLength(ITypeInfo & type, const byte * & cursor)
897898
unsigned len = type.getStringLen();
898899
if (!isUnknownLength(len))
899900
return len;
900-
len = *(unsigned *)cursor;
901-
cursor += sizeof(unsigned);
901+
unsigned lengthSize = getLengthSizeBytes(len);
902+
len = rtlReadInt(cursor, lengthSize);
903+
cursor += lengthSize;
902904
return len;
903905
}
904906

@@ -989,8 +991,10 @@ IResultSetCursor * CResultSetCursor::getChildren(int columnIndex) const
989991
return NULL;
990992
}
991993

992-
unsigned len = *(unsigned *)cur;
993-
const byte * data = cur + sizeof(unsigned);
994+
unsigned size = type.getSize();
995+
unsigned lengthSize = getLengthSizeBytes(size);
996+
unsigned len = rtlReadInt(cur, lengthSize);
997+
const byte * data = cur + lengthSize;
994998
Owned<IFvDataSource> childData = meta.meta->createChildDataSource(columnIndex, len, data);
995999
Owned<CResultSet> nestedResult = new CResultSet(childData, meta.alwaysUseXPath);
9961000
return nestedResult->createCursor();

common/fileview2/fvsource.cpp

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ void DataSourceMetaData::addSimpleField(const char * name, const char * xpath, I
273273
minRecordSize += sizeof(UChar);
274274
break;
275275
default:
276-
minRecordSize += sizeof(size32_t);
276+
minRecordSize += getLengthSizeBytes(size);
277277
break;
278278
}
279279
}
@@ -623,30 +623,31 @@ size32_t DataSourceMetaData::calcRecordSize(size32_t maxLength, const void *rec)
623623
if (isUnknownLength(size))
624624
{
625625
const byte * cur = data + curOffset;
626+
unsigned lengthSize = getLengthSizeBytes(size);
626627
switch (type.getTypeCode())
627628
{
628629
case type_data:
629630
case type_string:
630631
case type_table:
631632
case type_groupedtable:
632-
checkReadPastEnd(curOffset, sizeof(unsigned), maxLength);
633-
size = *((unsigned *)cur) + sizeof(unsigned);
633+
checkReadPastEnd(curOffset, lengthSize, maxLength);
634+
size = rtlReadInt(cur, lengthSize) + lengthSize;
634635
break;
635636
case type_set:
636-
checkReadPastEnd(curOffset, sizeof(bool) + sizeof(unsigned), maxLength);
637-
size = *((unsigned *)(cur + sizeof(bool))) + sizeof(unsigned) + sizeof(bool);
637+
checkReadPastEnd(curOffset, sizeof(bool) + lengthSize, maxLength);
638+
size = rtlReadInt(cur + sizeof(bool), lengthSize) + lengthSize + sizeof(bool);
638639
break;
639640
case type_qstring:
640-
checkReadPastEnd(curOffset, sizeof(unsigned), maxLength);
641-
size = rtlQStrSize(*((unsigned *)cur)) + sizeof(unsigned);
641+
checkReadPastEnd(curOffset, lengthSize, maxLength);
642+
size = rtlQStrSize(rtlReadInt(cur, lengthSize)) + lengthSize;
642643
break;
643644
case type_unicode:
644-
checkReadPastEnd(curOffset, sizeof(unsigned), maxLength);
645-
size = *((unsigned *)cur)*2 + sizeof(unsigned);
645+
checkReadPastEnd(curOffset, lengthSize, maxLength);
646+
size = rtlReadInt(cur, lengthSize) * sizeof(UChar) + lengthSize;
646647
break;
647648
case type_utf8:
648-
checkReadPastEnd(curOffset, sizeof(unsigned), maxLength);
649-
size = sizeof(unsigned) + rtlUtf8Size(*(unsigned *)cur, cur+sizeof(unsigned));
649+
checkReadPastEnd(curOffset, lengthSize, maxLength);
650+
size = lengthSize + rtlUtf8Size(rtlReadInt(cur, lengthSize), cur+lengthSize);
650651
break;
651652
case type_varstring:
652653
//buffer overflow checking for the following will wait until code is reimplemented

common/workunit/workunit.cpp

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11245,25 +11245,31 @@ void readRow(StringBuffer &out, MemoryBuffer &in, TypeInfoArray &types, StringAt
1124511245
StringAttrItem &name = names.item(idx);
1124611246
ITypeInfo &type = types.item(idx);
1124711247
unsigned size = type.getSize();
11248+
unsigned len = type.getStringLen();
1124811249
switch(type.getTypeCode())
1124911250
{
1125011251
case type_data:
11252+
case type_string:
11253+
case type_unicode:
11254+
case type_utf8:
11255+
case type_qstring:
1125111256
if (isUnknownLength(size))
1125211257
{
11253-
if (in.remaining() < sizeof(int))
11258+
unsigned lengthSize = getLengthSizeBytes(size);
11259+
if (in.remaining() < lengthSize)
1125411260
throw MakeStringException(WUERR_CorruptResult, "corrupt workunit information");
11255-
in.read(size);
11261+
len = rtlReadInt(in.readDirect(lengthSize), lengthSize);
1125611262
}
11257-
outputXmlData(size, in.readDirect(size), name.text, out);
11263+
break;
11264+
}
11265+
11266+
switch(type.getTypeCode())
11267+
{
11268+
case type_data:
11269+
outputXmlData(len, in.readDirect(len), name.text, out);
1125811270
break;
1125911271
case type_string:
11260-
if (isUnknownLength(size))
11261-
{
11262-
if (in.remaining() < sizeof(int))
11263-
throw MakeStringException(WUERR_CorruptResult, "corrupt workunit information");
11264-
in.read(size);
11265-
}
11266-
outputXmlString(size, (const char *) in.readDirect(size), name.text, out);
11272+
outputXmlString(len, (const char *) in.readDirect(len), name.text, out);
1126711273
break;
1126811274
case type_varstring:
1126911275
{
@@ -11274,21 +11280,11 @@ void readRow(StringBuffer &out, MemoryBuffer &in, TypeInfoArray &types, StringAt
1127411280
break;
1127511281
}
1127611282
case type_unicode:
11277-
{
11278-
unsigned len = type.getStringLen();
11279-
if (isUnknownLength(size))
11280-
in.read(len);
11281-
outputXmlUnicode(len, (UChar const *) in.readDirect(len*2), name.text, out);
11282-
}
11283+
outputXmlUnicode(len, (UChar const *) in.readDirect(len*2), name.text, out);
1128311284
break;
1128411285
case type_utf8:
1128511286
{
11286-
unsigned len = type.getStringLen();
11287-
if (isUnknownLength(size))
11288-
{
11289-
in.read(len);
11290-
size = rtlUtf8Size(len, in.readDirect(0));
11291-
}
11287+
size = rtlUtf8Size(len, in.readDirect(0));
1129211288
outputXmlUtf8(len, (const char *) in.readDirect(size), name.text, out);
1129311289
}
1129411290
break;

ecl/hql/hqlattr.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -686,24 +686,25 @@ static unsigned getMaxSize(ITypeInfo * type, IHqlExpression * maxLength, IHqlExp
686686
if (maxSize)
687687
return (unsigned)getIntValue(maxSize, 0);
688688

689+
size32_t lengthSize = getLengthSizeBytes(size);
689690
if (maxLength)
690691
{
691692
unsigned __int64 len = (unsigned)getIntValue(maxLength, 0);
692693
switch (type->getTypeCode())
693694
{
694695
case type_string:
695696
case type_data:
696-
return truncMaxlength(sizeof(size32_t) + len);
697+
return truncMaxlength(lengthSize + len);
697698
case type_unicode:
698-
return truncMaxlength(sizeof(size32_t) + len*sizeof(UChar));
699+
return truncMaxlength(lengthSize + len*sizeof(UChar));
699700
case type_qstring:
700-
return truncMaxlength(sizeof(size32_t) + rtlQStrSize((unsigned)len));
701+
return truncMaxlength(lengthSize + rtlQStrSize((unsigned)len));
701702
case type_varstring:
702703
return truncMaxlength(len + 1);
703704
case type_varunicode:
704705
return truncMaxlength((len + 1) * sizeof(UChar));
705706
case type_utf8:
706-
return truncMaxlength(sizeof(size32_t) + (len * 4));
707+
return truncMaxlength(lengthSize + (len * 4));
707708
case type_set:
708709
return truncMaxlength(len);
709710
}
@@ -727,6 +728,9 @@ static unsigned getMaxSize(ITypeInfo * type, IHqlExpression * maxLength, IHqlExp
727728
}
728729
}
729730

731+
if (lengthSize < 4)
732+
return (1U << (lengthSize*8)) - 1;
733+
730734
return UNKNOWN_LENGTH;
731735
}
732736

@@ -1006,7 +1010,7 @@ static IHqlExpression * evaluateFieldAttrSize(IHqlExpression * expr)
10061010
case type_utf8:
10071011
if (isUnknownLength(thisSize))
10081012
{
1009-
minSize = sizeof(size32_t);
1013+
minSize = getLengthSizeBytes(thisSize);
10101014
maxSize = getMaxSize(expr);
10111015
}
10121016
break;

0 commit comments

Comments
 (0)