diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a60944d..e9094fd 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -13,6 +13,7 @@ jobs: os: - ubuntu-latest - macos-latest + - windows-latest runs-on: ${{ matrix.os }} steps: - name: Install Go ${{ matrix.go-version }} @@ -20,5 +21,4 @@ jobs: with: go-version: ${{ matrix.go-version }} - uses: actions/checkout@v2 - - run: make install-kiwi - run: make test diff --git a/Makefile b/Makefile index b88c79b..6f47fbc 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,6 @@ ModelGenerator/default.dict: tar -xzvf model.tgz rm -f model.tgz - .PHONY: install-kiwi install-kiwi: bash scripts/install_kiwi.bash $(KIWI_VERSION) diff --git a/include/BitEncoder.hpp b/include/BitEncoder.hpp new file mode 100644 index 0000000..a1330f9 --- /dev/null +++ b/include/BitEncoder.hpp @@ -0,0 +1,487 @@ +#pragma once + +#include +#include +#include "BitUtils.h" + +namespace kiwi +{ + namespace lm + { + namespace detail + { + template + struct gcd + { + static constexpr size_t value = gcd::value; + }; + + template + struct gcd + { + static constexpr size_t value = a; + }; + + template + struct lcm + { + static constexpr size_t value = a * b / gcd::value; + }; + + template using Invoke = typename _T::type; + + template struct seq { using type = seq; }; + + template struct concat; + + template + struct concat, seq<_i2...>> + : seq<_i1..., (sizeof...(_i1) + _i2)...> {}; + + template + using Concat = Invoke>; + + template struct gen_seq; + template using GenSeq = Invoke>; + + template + struct gen_seq : Concat, GenSeq<_n - _n / 2>> {}; + + template<> struct gen_seq<0> : seq<> {}; + template<> struct gen_seq<1> : seq<0> {}; + + template + struct SeqSize; + + template + struct SeqSize> + { + static constexpr size_t value = sizeof...(_i); + }; + + template + struct slice; + + template + using Slice = Invoke>; + + template + struct slice, _j...> + { + using type = Slice, _j..., first>; + }; + + template + 
struct slice<0, seq, _j...> + { + using type = seq<_j...>; + }; + + template + struct slice<0, seq<>, _j...> + { + using type = seq<_j...>; + }; + + template + struct get; + + template + struct get> : get> + { + }; + + template + struct get<0, seq> : std::integral_constant + { + }; + + template<> + struct get<0, seq<>> + { + }; + } + + template + class FixedLengthEncoder + { + static constexpr size_t packetBits = sizeof(Packet) * 8; + static constexpr size_t bufSize = bits / detail::gcd::value; + static constexpr size_t numPhases = bufSize * packetBits / bits; + static constexpr size_t mask = (1 << bits) - 1; + std::array buf = { {0,} }; + size_t bPhase = 0; + Stream stream; + + void fetch() + { + stream.read((char*)buf.data(), bufSize * sizeof(Packet)); + } + + template + void writePhase(size_t i) + { + constexpr size_t packetPrefix = (bits * phase) / packetBits; + constexpr size_t bitPos = (bits * phase) % packetBits; + constexpr size_t packetBegin = (bits * phase + packetBits - 1) / packetBits; + constexpr size_t packetEnd = (bits * (phase + 1) + packetBits - 1) / packetBits; + + if (bitPos) + { + buf[packetPrefix] |= static_cast(i << bitPos); + i >>= packetBits - bitPos; + } + + for (size_t p = packetBegin; p < packetEnd; ++p) + { + buf[p] = static_cast(i); + i >>= packetBits; + } + + bPhase++; + if (phase == numPhases - 1) + { + flush(); + } + } + + template + void writeDispatch(size_t i, detail::seq) + { + using WriteFn = void(FixedLengthEncoder::*)(size_t); + + static constexpr WriteFn table[] = { + &FixedLengthEncoder::writePhase... + }; + return (this->*table[bPhase])(i); + } + + template + size_t readPhase() + { + constexpr size_t packetPrefix = (bits * phase) / packetBits; + constexpr size_t bitPos = (bits * phase) % packetBits; + constexpr size_t packetBegin = (bits * phase + packetBits - 1) / packetBits; + constexpr size_t packetEnd = (bits * (phase + 1) + packetBits - 1) / packetBits; + constexpr size_t shiftBias = bitPos ? 
(packetBits - bitPos) : 0; + + if (phase == 0) + { + fetch(); + } + + size_t i = 0; + if (bitPos) + { + i = buf[packetPrefix] >> bitPos; + } + + for (size_t p = packetBegin; p < packetEnd; ++p) + { + i |= buf[p] << (shiftBias + (p - packetBegin) * packetBits); + } + + if (phase == numPhases - 1) + { + bPhase = 0; + } + else + { + bPhase++; + } + return i & mask; + } + + template + size_t readDispatch(detail::seq) + { + using ReadFn = size_t(FixedLengthEncoder::*)(); + + static constexpr ReadFn table[] = { + &FixedLengthEncoder::readPhase... + }; + return (this->*table[bPhase])(); + } + + public: + + template + FixedLengthEncoder(Args&&... args) + : stream( std::forward(args)... ) + { + } + + void write(size_t i) + { + return writeDispatch(i & mask, detail::gen_seq{}); + } + + size_t read() + { + return readDispatch(detail::gen_seq{}); + } + + void flush() + { + stream.write((const char*)buf.data(), ((bPhase * bits + packetBits - 1) / packetBits) * sizeof(Packet)); + std::fill(buf.begin(), buf.end(), 0); + bPhase = 0; + } + + Stream& getStream() { return stream; } + const Stream& getStream() const { return stream; } + }; + + template + using BitSeq = detail::seq; + + namespace detail + { + template + struct VLTransform; + + template + struct VLTransform + { + Encoder& encoder; + + VLTransform(Encoder& _encoder) : encoder( _encoder ) + { + } + + void encode(size_t i) + { + constexpr size_t z = offset + (1 << firstBits); + if (i < z) + { + return encoder.template write(((i - offset) << (depth + 1)) | ((1 << depth) - 1)); + } + return VLTransform{ encoder }.encode(i); + } + + static constexpr size_t bias = VLTransform::bias; + }; + + template + struct VLTransform + { + Encoder& encoder; + + VLTransform(Encoder& _encoder) : encoder( _encoder ) + { + } + + void encode(size_t i) + { + constexpr size_t z = offset + (1 << firstBits); + if (i < z) + { + return encoder.template write(((i - offset) << depth) | ((1 << depth) - 1)); + } + throw std::runtime_error{ "failed to 
encode. out of range" }; + } + + static constexpr size_t bias = offset + (1 << firstBits); + }; + + template + struct VLTransform + { + Encoder& encoder; + + VLTransform(Encoder& _encoder) : encoder{ _encoder } + { + } + + static constexpr size_t bias = 0; + }; + + template + VLTransform makeVLTransform(Encoder& enc, BitSeq) + { + return { enc }; + } + + inline size_t getPrefixWidth(uint32_t mask) + { + return utils::countTrailingZeroes(~mask); + } + + inline size_t getPrefixWidth(uint64_t mask) + { + return utils::countTrailingZeroes(~mask); + } +#ifdef __APPLE__ + inline size_t getPrefixWidth(size_t mask) { return getPrefixWidth((uint64_t)mask); } +#endif + } + + template + class VariableLengthEncoder + { + template + friend struct detail::VLTransform; + + protected: + static constexpr size_t packetBits = sizeof(Packet) * 8; + std::array buf = { {0,} }; + Packet lastPacket = 0; + ptrdiff_t bitPos = 0; + Stream stream; + + void fetch() + { + lastPacket = buf[bufSize - 1]; + stream.read((char*)buf.data(), bufSize * sizeof(Packet)); + } + + template + void write(size_t i) + { + const ptrdiff_t packetPrefix = bitPos / packetBits; + const ptrdiff_t bitP = bitPos % packetBits; + const ptrdiff_t packetBegin = (bitPos + packetBits - 1) / packetBits; + const ptrdiff_t packetLen = (bitPos + bitwidth + packetBits - 1) / packetBits - packetBegin; + + if (bitP) + { + buf[packetPrefix] |= static_cast(i << bitP); + i >>= packetBits - bitP; + } + + size_t p, pp; + for (p = 0, pp = packetBegin; p < packetLen; ++p, ++pp) + { + if (pp == bufSize) + { + flush(true); + pp = 0; + } + buf[pp] = static_cast(i); + i >>= packetBits; + } + bitPos = (bitPos + bitwidth) % (bufSize * packetBits); + if (bitPos == 0 && pp == bufSize) + { + flush(true); + } + } + + size_t readBits(size_t width) + { + size_t i = 0; + + ptrdiff_t packetPrefix; + ptrdiff_t bitP; + ptrdiff_t packetBegin; + ptrdiff_t packetLen; + ptrdiff_t shiftBias; + if (bitPos < 0) + { + i = lastPacket >> (bitPos + packetBits); + 
packetPrefix = 0; + bitP = 0; + packetBegin = 0; + packetLen = (bitPos + width + packetBits - 1) / packetBits - packetBegin; + shiftBias = -bitPos; + } + else + { + packetPrefix = bitPos / packetBits; + bitP = bitPos % packetBits; + packetBegin = (bitPos + packetBits - 1) / packetBits; + packetLen = (bitPos + width + packetBits - 1) / packetBits - packetBegin; + shiftBias = bitP ? (packetBits - bitP) : 0; + } + + if (bitP) + { + i = buf[packetPrefix] >> bitP; + } + + size_t p, pp; + for (p = 0, pp = packetBegin; p < packetLen; ++p, ++pp) + { + if (pp == bufSize) + { + fetch(); + pp = 0; + } + i |= buf[pp] << (shiftBias + p * packetBits); + } + if (bitPos > 0 && (bitPos + width) % (bufSize * packetBits) == 0 && pp == bufSize) + { + fetch(); + } + + if (bitPos >= 0) bitPos = (bitPos + width) % (bufSize * packetBits); + else bitPos += width; + return i & ((1 << width) - 1); + } + + template + size_t readV() + { + size_t i = readBits(detail::get::value); + return i + decltype(detail::makeVLTransform(*this, detail::Slice{}))::bias; + } + + template + size_t readVDispatch(size_t width, detail::seq) + { + using ReadFn = size_t(VariableLengthEncoder::*)(); + + static constexpr ReadFn table[] = { + &VariableLengthEncoder::readV... + }; + return (this->*table[width])(); + } + + public: + + static constexpr size_t min_value = 0; + static constexpr size_t max_value = decltype(detail::makeVLTransform(std::declval(), BitSeqs{}))::bias - 1; + + template + VariableLengthEncoder(Args&&... args) + : stream( std::forward(args)... 
) + { + } + + void write(size_t i) + { + detail::makeVLTransform(*this, BitSeqs{}).encode(i); + } + + size_t read() + { + constexpr size_t maxPrefixWidth = detail::SeqSize::value - 1; + size_t i = readBits(maxPrefixWidth); + size_t prefixWidth = detail::getPrefixWidth(i); + bitPos -= maxPrefixWidth - std::min(prefixWidth + 1, maxPrefixWidth); + return readVDispatch(prefixWidth, detail::gen_seq::value>{}); + } + + void flush(bool full = false) + { + stream.write((const char*)buf.data(), full ? (bufSize * sizeof(Packet)) : ((bitPos + packetBits - 1) / packetBits * sizeof(Packet))); + std::fill(buf.begin(), buf.end(), 0); + } + + Stream& getStream() { return stream; } + const Stream& getStream() const { return stream; } + }; + + template + class VariableLengthDecoder : public VariableLengthEncoder + { + public: + template + VariableLengthDecoder(Args&&... args) + : VariableLengthEncoder( std::forward(args)... ) + { + this->fetch(); + } + }; + } +} diff --git a/include/BitUtils.h b/include/BitUtils.h new file mode 100644 index 0000000..924eafb --- /dev/null +++ b/include/BitUtils.h @@ -0,0 +1,110 @@ +#pragma once +#include + +#if defined(__SSE2__) || defined(__AVX2__) + #include +#endif + +namespace kiwi +{ + namespace utils + { + inline int countTrailingZeroes(uint32_t v) + { + if (v == 0) + { + return 32; + } +#if defined(__GNUC__) + return __builtin_ctz(v); +#elif defined(_MSC_VER) + unsigned long count; + _BitScanForward(&count, v); + return (int)count; +#else + // See Stanford bithacks, count the consecutive zero bits (trailing) on the + // right with multiply and lookup: + // http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup + static const uint8_t tbl[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, + 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, + 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; + return (int)tbl[((uint32_t)((v & -v) * 0x077CB531U)) >> 27]; +#endif + } + + inline int countTrailingZeroes(uint64_t v) + { + if (v == 0) + { + return 64; + } 
+#if defined(__GNUC__) + return __builtin_ctzll(v); +#elif defined(_MSC_VER) && defined(_M_X64) + unsigned long count; + _BitScanForward64(&count, v); + return (int)count; +#else + return (uint32_t)v ? countTrailingZeroes((uint32_t)v) + : 32 + countTrailingZeroes((uint32_t)(v >> 32)); +#endif + } + + inline int countLeadingZeroes(uint32_t v) + { + if (v == 0) + { + return 32; + } +#if defined(__GNUC__) + return __builtin_clz(v); +#elif defined(_MSC_VER) + unsigned long count; + _BitScanReverse(&count, v); + // BitScanReverse gives the bit position (0 for the LSB, then 1, etc.) of the + // first bit that is 1, when looking from the MSB. To count leading zeros, we + // need to adjust that. + return 31 - int(count); +#else + // See Stanford bithacks, find the log base 2 of an N-bit integer in + // O(lg(N)) operations with multiply and lookup: + // http://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn + static const uint8_t tbl[32] = { 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, + 13, 9, 6, 28, 1, 23, 19, 11, 3, 16, 14, + 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 }; + v = v | (v >> 1); + v = v | (v >> 2); + v = v | (v >> 4); + v = v | (v >> 8); + v = v | (v >> 16); + return (int)tbl[((uint32_t)(v * 0x07C4ACDDU)) >> 27]; +#endif + } + + inline int countLeadingZeroes(uint64_t v) + { + if (v == 0) + { + return 64; + } +#if defined(__GNUC__) + return __builtin_clzll(v); +#elif defined(_MSC_VER) && defined(_M_X64) + unsigned long count; + _BitScanReverse64(&count, v); + return 63 - int(count); +#else + return v >> 32 ? 
countLeadingZeroes((uint32_t)(v >> 32)) + : 32 + countLeadingZeroes((uint32_t)v); +#endif + } + + inline int ceilLog2(uint32_t v) { return 32 - countLeadingZeroes(v - 1); } + + inline int ceilLog2(uint64_t v) { return 64 - countLeadingZeroes(v - 1); } + +#ifdef __APPLE__ + inline int ceilLog2(size_t v) { return ceilLog2((uint64_t)v); } +#endif + } +} \ No newline at end of file diff --git a/include/FixedVector.hpp b/include/FixedVector.hpp new file mode 100644 index 0000000..6102180 --- /dev/null +++ b/include/FixedVector.hpp @@ -0,0 +1,77 @@ +#pragma once +#include + +namespace kiwi +{ + template + class FixedVector + { + void* _data = nullptr; + public: + FixedVector(size_t s = 0) + { + if (s) + { + _data = std::malloc(sizeof(Ty) * s + sizeof(size_t)); + *(size_t*)_data = s; + for (size_t i = 0; i < s; ++i) + { + new (&operator[](i)) Ty; + } + } + else _data = nullptr; + } + + FixedVector(const FixedVector& o) + { + if (!o.empty()) + { + _data = std::malloc(sizeof(Ty) * o.size() + sizeof(size_t)); + *(size_t*)_data = o.size(); + for (size_t i = 0; i < o.size(); ++i) + { + new (&operator[](i)) Ty{ o[i] }; + } + } + } + + FixedVector(FixedVector&& o) + { + std::swap(_data, o._data); + } + + ~FixedVector() + { + if (!_data) return; + for (auto& p : *this) p.~Ty(); + std::free(_data); + } + /** Copy assignment via copy-and-swap: the copy is completed before the current contents are released. */ + FixedVector& operator=(const FixedVector& o) + { + FixedVector tmp(o); /* copy first, so self-assignment and a throwing element copy leave *this intact */ + std::swap(_data, tmp._data); /* tmp now holds the previous contents and releases them in its destructor */ + return *this; + } + + FixedVector& operator=(FixedVector&& o) + { + std::swap(_data, o._data); + return *this; + } + + size_t size() const { return _data ? *(const size_t*)_data : 0; } + bool empty() const { return !size(); } + + Ty* data() { return _data ? (Ty*)((size_t*)_data + 1) : nullptr; } + const Ty* data() const { return _data ? 
(const Ty*)((const size_t*)_data + 1) : nullptr; } + + Ty* begin() { return data(); } + Ty* end() { return data() + size(); } + const Ty* begin() const { return data(); } + const Ty* end() const { return data() + size(); } + + Ty& operator[](size_t i) { return data()[i]; } + const Ty& operator[](size_t i) const { return data()[i]; } + }; +} \ No newline at end of file diff --git a/include/Form.h b/include/Form.h new file mode 100644 index 0000000..2e3150a --- /dev/null +++ b/include/Form.h @@ -0,0 +1,189 @@ +/** + * @file Form.h + * @author bab2min (bab2min@gmail.com) + * @brief 형태 및 형태소에 관한 정보를 담는 구조체들이 선언된 헤더 + * @version 0.10.0 + * @date 2021-09-10 + * + * + */ + +#pragma once + +#include +#include + +namespace kiwi +{ + struct Morpheme; + + /** + * @brief 형태소에 관한 모든 정보를 담는 구조체의 템플릿 + * + * @note 변경가능한 상태로 인덱스와 관련된 값이나 std::vector 등의 길이를 변경할 수 있음. + * `kiwi::KiwiBuilder`에서 사용한다. + * `baked = true`는 변경 불가능한 상태로 인덱스는 모두 포인터로, std::vector는 FixedVector로 변경되어 수정이 불가능한 대신 + * 각 값에 효율적으로 빠르게 접근 가능하다. 이 상태는 `kiwi::Morpheme`이라는 타입의 부모클래스로 쓰이며, + * `kiwi::Kiwi` 내 실제 형태소 분석 단계에 쓰인다. + */ + struct MorphemeRaw + { + uint32_t kform = 0; /**< 형태에 대한 포인터 */ + POSTag tag = POSTag::unknown; /**< 품사 태그 */ + CondVowel vowel = CondVowel::none; /**< 선행형태소의 자/모음 조건 */ + CondPolarity polar = CondPolarity::none; /**< 선행형태소의 모음조화 조건 */ + + /** + * @brief 형태소가 두 부분으로 분할된 경우 결합 번호를 표기하기 위해 사용된다. + * + * @note `덥/VA`, `춥/VA` 등의 형태소는 `어/EC`와 만나면 `더워`, `추워`와 같이 형태가 변화한다. + * 이 경우를 각각 처리하기 보다는 `더/V + ㅂ/V`, `추/V + ㅂ/V`과 같이 분해하면 + * `ㅂ/V` + `어/EC`가 `워`로 변한다는 규칙만으로 처리가 가능해진다. (이 규칙은 `chunks`를 이용해 형태소 정보에 담길 수 있음) + * 그러나 모든 ㅂ으로 끝나는 형태소가 위와 같은 규칙에 결합되면 안된다. + * 예를 들어 `굽/VA`의 경우 `어/EC`와 만나도 `굽어`라고 형태가 유지되기 때문. + * 따라서 `ㅂ/V`이 결합할 수 있는 조건을 명시해서 이 조건과 맞는 경우에만 `더/V + ㅂ/V` -> `덥/VA`과 같이 복원해야 한다. + * `combineSocket`이 0이면 이런 결합 조건이 없는 일반 형태소임을 뜻하며, 0이 아닌 경우 결합 조건을 가지고 분해된 형태소임을 뜻한다. + * `더/V`와 `워/UNK`(`ㅂ/V + 어/EC`)는 예를 들어 3과 같이 동일한 combineSocket을 할당해 둘이 서로 결합이 가능한 형태소임을 식별한다. 
+ */ + uint8_t combineSocket = 0; + + /** + * @brief 여러 형태소가 결합되어 형태가 변경된 경우에 원 형태소 목록을 표기하기 위해 사용된다. + * + * @note `되/VV + 어/EC`의 결합은 `돼`라는 형태로 축약될 수 있다. + * 분석과정에서 `돼`를 만난 경우 역으로 `되/VV + 어/EC`로 분석할 수 있도록 `돼/UNK`를 더미 형태소로 등록하고 + * chunks에는 `되/VV`와 `어/EC`에 대한 포인터를 넣어둔다. + */ + Vector chunks; + + /** + * @brief 분할된 형태소의 원형 형태소를 가리키는 오프셋 + * + * @note `덥/VA`이 `더/V` + `ㅂ/V`으로 분할된 경우 `더/V`는 `덥/VA`에 대한 오프셋을 combined에 저장해둔다. + * `kiwi::Morpheme::getCombined()`를 통해 원형 형태소의 포인터를 구할 수 있음 + * @sa combineSocket + */ + int32_t combined = 0; + float userScore = 0; + + MorphemeRaw(); + ~MorphemeRaw(); + MorphemeRaw(const MorphemeRaw&); + MorphemeRaw(MorphemeRaw&&); + MorphemeRaw& operator=(const MorphemeRaw&); + MorphemeRaw& operator=(MorphemeRaw&&); + + MorphemeRaw( + POSTag _tag, + CondVowel _vowel = CondVowel::none, + CondPolarity _polar = CondPolarity::none, + uint8_t _combineSocket = 0 + ); + + void serializerRead(std::istream& istr); + void serializerWrite(std::ostream& ostr) const; + }; + + /** + * @brief 형태소에 관한 모든 정보를 담는 구조체의 템플릿 + * + * @note 변경 불가능한 상태로 인덱스는 모두 포인터로, std::vector는 FixedVector로 변경되어 수정이 불가능한 대신 + * 각 값에 효율적으로 빠르게 접근 가능하다. `kiwi::Kiwi` 내 실제 형태소 분석 단계에 쓰인다. + */ + struct Morpheme + { + const KString* kform = nullptr; + POSTag tag = POSTag::unknown; + CondVowel vowel = CondVowel::none; + CondPolarity polar = CondPolarity::none; + uint8_t combineSocket = 0; + FixedVector chunks; + int32_t combined = 0; + float userScore = 0; + + Morpheme(); + ~Morpheme(); + Morpheme(const Morpheme&); + Morpheme(Morpheme&&); + Morpheme& operator=(const Morpheme&); + Morpheme& operator=(Morpheme&&); + + std::ostream& print(std::ostream& os) const; + + /** 형태소의 형태를 반환한다. */ + const KString& getForm() const { return *kform; } + + /** 분할된 형태소의 경우 원형 형태소를 반환한다. 그 외에는 자기 자신을 반환한다. */ + const Morpheme* getCombined() const { return this + combined; } + }; + + /** + * @brief 형태에 관한 모든 정보를 담는 구조체의 템플릿 + * + * @note 변경가능한 상태로 인덱스와 관련된 값이나 std::vector 등의 길이를 변경할 수 있음. 
`kiwi::KiwiBuilder`에서 사용한다. + * `baked = true`는 변경 불가능한 상태로 인덱스는 모두 포인터로, std::vector는 FixedVector로 변경되어 수정이 불가능한 대신 + * 각 값에 효율적으로 빠르게 접근 가능하다. 이 상태는 `kiwi::Form`이라는 타입의 부모클래스로 쓰이며, + * `kiwi::Kiwi` 내 실제 형태소 분석 단계에 쓰인다. + */ + struct FormRaw + { + KString form; /**< 형태 */ + CondVowel vowel = CondVowel::none; /**< 선행형태소의 자/모음 조건 */ + CondPolarity polar = CondPolarity::none; /**< 선행형태소의 모음조화 조건 */ + Vector candidate; + /**< 이 형태에 해당하는 형태소들의 목록 */ + + FormRaw(); + ~FormRaw(); + FormRaw(const FormRaw&); + FormRaw(FormRaw&&); + FormRaw& operator=(const FormRaw&); + FormRaw& operator=(FormRaw&&); + + FormRaw(const KString& _form, CondVowel _vowel, CondPolarity _polar); + bool operator<(const FormRaw& o) const; + + void serializerRead(std::istream& istr); + void serializerWrite(std::ostream& ostr) const; + }; + + /** + * @brief 형태에 관한 모든 정보를 담는 구조체의 템플릿 + * + * @note 변경 불가능한 상태로 인덱스는 모두 포인터로, std::vector는 FixedVector로 변경되어 수정이 불가능한 대신 + * 각 값에 효율적으로 빠르게 접근 가능하다. `kiwi::Kiwi` 내 실제 형태소 분석 단계에 쓰인다. + */ + struct Form + { + KString form; + CondVowel vowel = CondVowel::none; + CondPolarity polar = CondPolarity::none; + FixedVector candidate; + + Form(); + ~Form(); + Form(const Form&); + Form(Form&&); + Form& operator=(const Form&); + Form& operator=(Form&&); + }; + + /** + * @brief 변경가능한 형태 정보를 bake하여 최적화한다. + * + * @param o 변경 가능한 형태 정보 + * @param morphBase 형태소 배열의 시작 위치 + * @return 최적화된 형태 정보 + */ + Form bake(const FormRaw& o, const Morpheme* morphBase); + + /** + * @brief 변경 가능한 형태소 정보를 bake하여 최적화한다. 
+ * + * @param o 변경 가능한 형태소 정보 + * @param morphBase 형태소 배열의 시작 위치 + * @param formBase 형태 배열의 시작 위치 + * @return 최적화된 형태소 정보 + */ + Morpheme bake(const MorphemeRaw& o, const Morpheme* morphBase, const Form* formBase); +} diff --git a/include/FrozenTrie.h b/include/FrozenTrie.h new file mode 100644 index 0000000..0752b19 --- /dev/null +++ b/include/FrozenTrie.h @@ -0,0 +1,80 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace kiwi +{ + namespace utils + { + namespace detail + { + template + struct HasSubmatch {}; + + template + struct HasSubmatch::value>::type> + { + static constexpr Value hasSubmatch = (Value)-1; + }; + + template + struct HasSubmatch::value>::type> + { + static constexpr ptrdiff_t hasSubmatch = -1; + }; + } + + template + class FrozenTrie : public detail::HasSubmatch<_Value> + { + public: + using Key = _Key; + using Value = _Value; + using Diff = _Diff; + + struct Node + { + Key numNexts = 0; + Diff lower = 0; + uint32_t nextOffset = 0; + + const Node* next(const FrozenTrie& ft, Key c) const; + const Node* fail() const; + const Node* findFail(const FrozenTrie& ft, Key c) const; + const Value& val(const FrozenTrie& ft) const; + }; + private: + size_t numNodes = 0; + size_t numNexts = 0; + std::unique_ptr nodes; + std::unique_ptr values; + std::unique_ptr nextKeys; + std::unique_ptr nextDiffs; + + public: + + FrozenTrie() = default; + + template + FrozenTrie(const ContinuousTrie& trie); + + FrozenTrie(const FrozenTrie& o); + FrozenTrie(FrozenTrie&&) = default; + + FrozenTrie& operator=(const FrozenTrie& o); + FrozenTrie& operator=(FrozenTrie&& o) = default; + + bool empty() const { return !numNodes; } + size_t size() const { return numNodes; } + const Node* root() const { return nodes.get(); } + + const Value& value(size_t idx) const { return values[idx]; }; + }; + } +} diff --git a/include/Kiwi.h b/include/Kiwi.h new file mode 100644 index 0000000..6894226 --- /dev/null +++ b/include/Kiwi.h @@ -0,0 
+1,363 @@ +/** + * @file Kiwi.h + * @author bab2min (bab2min@gmail.com) + * @brief Kiwi C++ API를 담고 있는 헤더 파일 + * @version 0.10.0 + * @date 2021-08-31 + * + * + */ +#pragma once + +#include +#include +#include +#include "Macro.h" +#include "Types.h" +#include "Form.h" +#include "Utils.h" +#include "Trainer.h" +#include "Trie.hpp" +#include "PatternMatcher.h" +#include "FrozenTrie.h" +#include "Knlm.h" +#include "ThreadPool.h" +#include "WordDetector.h" + +namespace kiwi +{ + struct KTrie; + struct KGraphNode; + struct WordInfo; + + /** + * @brief 실제 형태소 분석을 수행하는 클래스. + * + */ + class Kiwi + { + friend class KiwiBuilder; + friend class PathEvaluator; + + bool integrateAllomorph = true; + float cutOffThreshold = 5; + + std::vector
forms; + std::vector morphemes; + utils::FrozenTrie formTrie; + std::shared_ptr langMdl; + std::unique_ptr pool; + + std::vector analyzeSent(const std::u16string::const_iterator& sBegin, const std::u16string::const_iterator& sEnd, size_t topN, Match matchOptions) const; + + const Morpheme* getDefaultMorpheme(POSTag tag) const; + + public: + /** + * @brief 빈 Kiwi 객체를 생성한다. + * + * @note 이 생성자는 기본 생성자로 이를 통해 생성된 객체는 바로 형태소 분석에 사용할 수 없다. + * kiwi::KiwiBuilder 를 통해 생성된 객체만이 형태소 분석에 사용할 수 있다. + */ + Kiwi(); + + ~Kiwi(); + + Kiwi(const Kiwi&) = delete; + + Kiwi(Kiwi&&); + + Kiwi& operator=(const Kiwi&) = delete; + + Kiwi& operator=(Kiwi&&); + + /** + * @brief 현재 Kiwi 객체가 형태소 분석을 수행할 준비가 되었는지를 알려준다. + * + * @return 형태소 분석 준비가 완료된 경우 true를 반환한다. + * + * @note 기본 생성자를 통해 생성된 경우 언제나 `ready() == false`이며, + * `kiwi::KiwiBuilder`를 통해 생성된 경우 `ready() == true`이다. + */ + bool ready() const { return !forms.empty(); } + + /** + * @brief + * + * @param str + * @param matchOptions + * @return TokenResult + */ + TokenResult analyze(const std::u16string& str, Match matchOptions) const + { + return analyze(str, 1, matchOptions)[0]; + } + + /** + * @brief + * + * @param str + * @param matchOptions + * @return TokenResult + */ + TokenResult analyze(const std::string& str, Match matchOptions) const + { + return analyze(utf8To16(str), matchOptions); + } + + /** + * @brief + * + * @param str + * @param topN + * @param matchOptions + * @return std::vector + */ + std::vector analyze(const std::u16string& str, size_t topN, Match matchOptions) const; + + /** + * @brief + * + * @param str + * @param topN + * @param matchOptions + * @return std::vector + */ + std::vector analyze(const std::string& str, size_t topN, Match matchOptions) const + { + return analyze(utf8To16(str), topN, matchOptions); + } + + /** + * @brief + * + * @param str + * @param topN + * @param matchOptions + * @return std::future> + */ + std::future> asyncAnalyze(const std::string& str, size_t topN, Match matchOptions) const; 
+ + /** + * @brief Analyzes every input supplied by `reader` and passes each result to `resultCallback` in input order; with a thread pool, up to `pool->size() * 2` analyses are kept in flight. + * + * @tparam ReaderCallback callable returning the next input string; an empty string ends the input + * @tparam ResultCallback callable receiving the result of each analysis + * @param topN forwarded to `analyze(str, topN, matchOptions)` + * @param reader source of the input strings + * @param resultCallback sink for the analysis results + * @param matchOptions forwarded to `analyze(str, topN, matchOptions)` + */ + template + void analyze(size_t topN, ReaderCallback&& reader, ResultCallback&& resultCallback, Match matchOptions) const + { + if (pool) + { + bool stop = false; + std::deque>> futures; + for (size_t i = 0; i < pool->size() * 2; ++i) + { + auto ustr = reader(); + if (ustr.empty()) + { + stop = true; + break; + } + futures.emplace_back(pool->enqueue([&, ustr](size_t tid) + { + return analyze(ustr, topN, matchOptions); + })); + } + + while (!futures.empty()) + { + resultCallback(futures.front().get()); + futures.pop_front(); + if (!stop) + { + auto ustr = reader(); + if (ustr.empty()) + { + stop = true; + continue; + } + futures.emplace_back(pool->enqueue([&, ustr](size_t tid) + { + return analyze(ustr, topN, matchOptions); + })); + } + } + } + else + { + while(1) + { + auto ustr = reader(); + if (ustr.empty()) break; + resultCallback(analyze(ustr, topN, matchOptions)); + } + } + } + + size_t morphToId(const Morpheme* morph) const + { + if (!morph || morph < morphemes.data()) return -1; + return morph - morphemes.data(); + } + + const Morpheme* idToMorph(size_t morphId) const + { + if (morphId >= morphemes.size()) return nullptr; + return &morphemes[morphId]; + } + /** Returns the size of the thread pool, or 1 when there is no pool (analysis then runs on the calling thread). */ + size_t getNumThreads() const + { + return pool ? pool->size() : 1; /* only dereference pool when it exists */ + } + + float getCutOffThreshold() const + { + return cutOffThreshold; + } + + void setCutOffThreshold(float v) + { + cutOffThreshold = v; + } + + bool getIntegrateAllomorph() const + { + return integrateAllomorph; + } + + void setIntegrateAllomorph(bool v) + { + integrateAllomorph = v; + } + + const lm::KnLangModelBase* getLangModel() const + { + return langMdl.get(); + } + }; + + /** + * @brief Manages the dictionary used for morphological analysis, + * and creates instances of Kiwi that perform the actual analysis based on that dictionary. 
+ * + */ + class KiwiBuilder + { + std::vector forms; + std::vector morphemes; + std::unordered_map formMap; + std::shared_ptr langMdl; + size_t numThreads = 0; + WordDetector detector; + BuildOption options = BuildOption::none; + + void loadMorphBin(std::istream& is); + void saveMorphBin(std::ostream& os) const; + FormRaw& addForm(KString form, CondVowel vowel, CondPolarity polar); + + using MorphemeMap = std::unordered_map, size_t>; + void loadMMFromTxt(std::istream&& is, MorphemeMap& morphMap, std::unordered_map* posWeightSum, const std::function& selector); + void loadCMFromTxt(std::istream&& is, MorphemeMap& morphMap); + void loadPCMFromTxt(std::istream&& is, MorphemeMap& morphMap); + void addCorpusTo(Vector>& out, std::istream&& is, MorphemeMap& morphMap); + void updateForms(); + public: + struct FromRawData {}; + static constexpr FromRawData fromRawDataTag = {}; + + /** + * @brief KiwiBuilder의 기본 생성자 + * + * @note 이 생성자로 생성된 경우 `ready() == false`인 상태이므로 유효한 Kiwi 객체를 생성할 수 없다. + */ + KiwiBuilder(); + + ~KiwiBuilder(); + + KiwiBuilder(const KiwiBuilder&); + + KiwiBuilder(KiwiBuilder&&); + + KiwiBuilder& operator=(const KiwiBuilder&); + + KiwiBuilder& operator=(KiwiBuilder&&); + + /** + * @brief KiwiBuilder를 raw 데이터로부터 생성한다. + * + * @param rawDataPath + * @param numThreads + * @param options + * + * @note 이 함수는 현재 내부적으로 모델 구축에 쓰인다. + * 추후 공개 데이터로도 쉽게 직접 모델을 구축할 수 있도록 개선된 API를 제공할 예정. + */ + KiwiBuilder(FromRawData, const std::string& rawDataPath, size_t numThreads = 0, BuildOption options = BuildOption::integrateAllomorph | BuildOption::loadDefaultDict); + + /** + * @brief KiwiBuilder를 모델 파일로부터 생성한다. + * + * @param modelPath 모델이 위치한 경로 + * @param numThreads 모델 및 형태소 분석에 사용할 스레드 개수 + * @param options 생성 옵션. `kiwi::BuildOption`을 참조 + */ + KiwiBuilder(const std::string& modelPath, size_t numThreads = 0, BuildOption options = BuildOption::integrateAllomorph | BuildOption::loadDefaultDict); + + /** + * @brief 현재 KiwiBuilder 객체가 유효한 분석 모델을 로딩한 상태인지 알려준다. 
+ * + * @return 유효한 상태면 true를 반환한다. 기본 생성자로 생성한 경우 `ready() == false`이며, + * 다른 생성자로 생성한 경우는 `ready() == true`이다. + */ + bool ready() const + { + return !!langMdl; + } + + void saveModel(const std::string& modelPath) const; + + /** + * @brief + * + * @param str + * @param tag + * @param score + * @return + */ + bool addWord(const std::u16string& str, POSTag tag = POSTag::nnp, float score = 0); + + /** + * @brief + * + * @param dictPath + * @return + */ + size_t loadDictionary(const std::string& dictPath); + + std::vector extractWords(const U16MultipleReader& reader, + size_t minCnt = 10, size_t maxWordLen = 10, float minScore = 0.25, float posThreshold = -3, bool lmFilter = true + ) const; + + std::vector extractAddWords(const U16MultipleReader& reader, + size_t minCnt = 10, size_t maxWordLen = 10, float minScore = 0.25, float posThreshold = -3, bool lmFilter = true + ); + + /** + * @brief 현재 단어 및 사전 설정을 기반으로 Kiwi 객체를 생성한다. + * + * @return 형태소 분석 준비가 완료된 Kiwi의 객체. + */ + Kiwi build() const; + + const lm::KnLangModelBase* getLangModel() const + { + return langMdl.get(); + } + }; +} diff --git a/include/Knlm.h b/include/Knlm.h new file mode 100644 index 0000000..084d67c --- /dev/null +++ b/include/Knlm.h @@ -0,0 +1,156 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "Mmap.h" + +namespace kiwi +{ + namespace lm + { + using Vid = uint16_t; + + struct Header + { + uint64_t num_nodes, node_offset, key_offset, ll_offset, gamma_offset, qtable_offset, htx_offset; + uint64_t unk_id, bos_id, eos_id, vocab_size; + uint8_t order, key_size, diff_size, quantized; + }; + + template + struct Node + { + KeyType num_nexts = 0; + DiffType lower = 0; + uint32_t next_offset = 0; + }; + + class KnLangModelBase + { + protected: + utils::MemoryObject base; + + KnLangModelBase(utils::MemoryObject&& mem) : base{ std::move(mem) } + { + } + + virtual float getLL(ptrdiff_t node_idx, size_t next) const = 0; + virtual std::vector allNextLL(ptrdiff_t 
node_idx) const = 0; + virtual std::vector allNextLL(ptrdiff_t node_idx, std::vector& next_node_idx) const = 0; + + public: + + virtual ~KnLangModelBase() {} + const Header& getHeader() const { return *reinterpret_cast(base.get()); } + + virtual size_t llSize() const = 0; + virtual const float* getLLBuf() const = 0; + virtual const float* getGammaBuf() const = 0; + + static std::unique_ptr create(utils::MemoryObject&& mem); + + template + static utils::MemoryOwner build(const utils::ContinuousTrie& ngram_cf, + size_t order, size_t min_cf, size_t last_min_cf, + size_t unk_id, size_t bos_id, size_t eos_id, + float unigram_alpha, size_t quantize, bool compress, + const std::vector>* bigram_list = nullptr, + const std::vector* historyTransformer = nullptr + ); + + const utils::MemoryObject& getMemory() const { return base; } + + virtual float progress(ptrdiff_t& node_idx, size_t next) const = 0; + + template + void evaluate(InTy in_first, InTy in_last, OutTy out_first) const + { + ptrdiff_t node_idx = 0; + for (; in_first != in_last; ++in_first) + { + *out_first = progress(node_idx, *in_first); + ++out_first; + } + } + + template + float sum(InTy in_first, InTy in_last, float min_score = -100) const + { + float ret = 0; + ptrdiff_t node_idx = 0; + for (; in_first != in_last; ++in_first) + { + ret += std::max(progress(node_idx, *in_first), min_score); + } + return ret; + } + + template + std::vector getNextLL(InTy in_first, InTy in_last) const + { + ptrdiff_t node_idx = 0; + for (; in_first != in_last; ++in_first) + { + progress(node_idx, *in_first); + } + return allNextLL(node_idx); + } + + template + void predict(InTy in_first, InTy in_last, OutTy out_first) const + { + ptrdiff_t node_idx = 0; + for (; in_first != in_last; ++in_first) + { + progress(node_idx, *in_first); + *out_first = allNextLL(node_idx); + ++out_first; + } + } + + template + void fillIn(PfTy prefix_first, PfTy prefix_last, SfTy suffix_first, SfTy suffix_last, OutTy out_first, bool reduce = true) 
const + { + ptrdiff_t node_idx = 0; + for (; prefix_first != prefix_last; ++prefix_first) + { + progress(node_idx, *prefix_first); + } + + std::vector next_node_idcs; + *out_first = allNextLL(node_idx, next_node_idcs); + + if (reduce) + { + for (size_t i = 0; i < next_node_idcs.size(); ++i) + { + auto node_idx = next_node_idcs[i]; + for (auto it = suffix_first; it != suffix_last; ++it) + { + (*out_first)[i] += progress(node_idx, *it); + } + } + } + else + { + ++out_first; + for (size_t i = 0; i < next_node_idcs.size(); ++i) + { + auto node_idx = next_node_idcs[i]; + auto out_next = out_first; + for (auto it = suffix_first; it != suffix_last; ++it) + { + (*out_next)[i] = progress(node_idx, *it); + ++out_next; + } + } + } + } + }; + } +} diff --git a/include/Macro.h b/include/Macro.h new file mode 100644 index 0000000..0c0acea --- /dev/null +++ b/include/Macro.h @@ -0,0 +1,10 @@ +#pragma once + +#define KIWI_STR_HELPER(x) #x +#define KIWI_STR(x) KIWI_STR_HELPER(x) + +#define KIWI_VERSION_MAJOR 0 +#define KIWI_VERSION_MINOR 10 +#define KIWI_VERSION_PATCH 2 + +#define KIWI_VERSION_STRING KIWI_STR(KIWI_VERSION_MAJOR) "." KIWI_STR(KIWI_VERSION_MINOR) "." 
namespace kiwi
{
	namespace utils
	{
		namespace detail
		{
			/**
			 * Move-only owner of a Win32 HANDLE.
			 *
			 * The handle is released with CloseHandle() on destruction. Both nullptr and
			 * INVALID_HANDLE_VALUE are treated as "nothing to close".
			 */
			class HandleGuard
			{
				HANDLE handle = nullptr;
			public:
				HandleGuard(HANDLE _handle = nullptr) : handle(_handle)
				{
				}

				HandleGuard(const HandleGuard&) = delete;
				HandleGuard& operator =(const HandleGuard&) = delete;

				// Moving is a swap: the moved-from guard ends up holding whatever this one held.
				HandleGuard(HandleGuard&& o) noexcept
				{
					std::swap(handle, o.handle);
				}

				HandleGuard& operator=(HandleGuard&& o) noexcept
				{
					std::swap(handle, o.handle);
					return *this;
				}

				~HandleGuard()
				{
					if (handle && handle != INVALID_HANDLE_VALUE)
					{
						CloseHandle(handle);
						handle = nullptr;
					}
				}

				operator HANDLE() const
				{
					return handle;
				}
			};
		}

		/**
		 * Read-only memory mapping of a whole file (Win32 implementation).
		 *
		 * The object is move-only. A moved-from MMap owns nothing: get() returns nullptr
		 * and size() returns 0.
		 */
		class MMap
		{
			const char* view = nullptr;
			size_t len = 0;
			detail::HandleGuard hFile, hFileMap;

			// Exchanges the complete state (view, length and both handles) with `o`.
			void swap(MMap& o) noexcept
			{
				std::swap(view, o.view);
				std::swap(len, o.len);
				std::swap(hFile, o.hFile);
				std::swap(hFileMap, o.hFileMap);
			}

		public:
			/**
			 * Opens `filepath` and maps its full contents for reading.
			 *
			 * @param filepath path of the file to map
			 * @throws std::ios_base::failure if the file cannot be opened, the mapping
			 *         object cannot be created, or the view cannot be mapped
			 */
			MMap(const std::string& filepath)
			{
				hFile = CreateFileA(filepath.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, nullptr);
				if (hFile == INVALID_HANDLE_VALUE) throw std::ios_base::failure("Cannot open '" + filepath + "'");
				hFileMap = CreateFileMapping(hFile, nullptr, PAGE_READONLY, 0, 0, nullptr);
				if (hFileMap == nullptr) throw std::ios_base::failure("Cannot open '" + filepath + "' Code:" + std::to_string(GetLastError()));
				view = (const char*)MapViewOfFile(hFileMap, FILE_MAP_READ, 0, 0, 0);
				// MapViewOfFile reports failure with a null pointer; do not hand it to callers.
				if (view == nullptr) throw std::ios_base::failure("Mapping failed");
				DWORD high = 0;
				const DWORD low = GetFileSize(hFile, &high);
				// Combine the halves in a 64-bit type so the shift is well defined even
				// when size_t is only 32 bits wide.
				len = static_cast<size_t>((static_cast<unsigned long long>(high) << 32) | low);
			}

			MMap(const MMap&) = delete;
			MMap& operator=(const MMap&) = delete;

			// Swap-based moves: the previous mapping of the target (if any) travels to `o`
			// and is released by o's destructor together with its own handles.
			MMap(MMap&& o) noexcept
			{
				swap(o);
			}

			MMap& operator=(MMap&& o) noexcept
			{
				swap(o);
				return *this;
			}

			~MMap()
			{
				// Only the view needs explicit release; hFileMap and hFile close themselves
				// through their own destructors (in that order) right after this body runs.
				if (view)
				{
					UnmapViewOfFile(view);
				}
			}

			/** @return pointer to the first mapped byte, or nullptr for a moved-from object */
			const char* get() const { return view; }
			/** @return number of mapped bytes (the file size at mapping time) */
			size_t size() const { return len; }
		};
	}
}
namespace kiwi
{
	namespace utils
	{
		namespace detail
		{
			/**
			 * Move-only owner of a POSIX file descriptor; close()s it on destruction.
			 *
			 * NOTE: 0 doubles as the "empty" marker here, so a guard holding descriptor 0
			 * (or -1) never closes anything.
			 */
			class FDGuard
			{
				int fd = 0;
			public:
				FDGuard(int _fd = 0) : fd(_fd)
				{
				}

				FDGuard(const FDGuard&) = delete;
				FDGuard& operator =(const FDGuard&) = delete;

				// Moving is a swap: the moved-from guard ends up holding whatever this one held.
				FDGuard(FDGuard&& o) noexcept
				{
					std::swap(fd, o.fd);
				}

				FDGuard& operator=(FDGuard&& o) noexcept
				{
					std::swap(fd, o.fd);
					return *this;
				}

				~FDGuard()
				{
					if (fd && fd != -1)
					{
						close(fd);
						fd = 0;
					}
				}

				operator int() const
				{
					return fd;
				}
			};
		}

		/**
		 * Read-only memory mapping of a whole file (POSIX implementation).
		 *
		 * The object is move-only. A moved-from MMap owns nothing: get() returns nullptr
		 * and size() returns 0.
		 */
		class MMap
		{
			const char* view = nullptr;
			size_t len = 0;
			detail::FDGuard fd;

			// Exchanges the complete state with `o`. The length and the descriptor must
			// travel together with the pointer: munmap() needs the real length, and the
			// descriptor guard is what eventually closes the file.
			void swap(MMap& o) noexcept
			{
				std::swap(view, o.view);
				std::swap(len, o.len);
				std::swap(fd, o.fd);
			}

		public:
			/**
			 * Opens `filepath` and maps its full contents for reading.
			 *
			 * @param filepath path of the file to map
			 * @throws std::ios_base::failure if the file cannot be opened, stat'ed or mapped
			 */
			MMap(const std::string& filepath)
			{
				fd = open(filepath.c_str(), O_RDONLY);
				if (fd == -1) throw std::ios_base::failure("Cannot open '" + filepath + "'");
				struct stat sb;
				if (fstat(fd, &sb) < 0) throw std::ios_base::failure("Cannot open '" + filepath + "'");
				len = sb.st_size;
				view = (const char*)mmap(nullptr, len, PROT_READ, MAP_PRIVATE, fd, 0);
				if (view == MAP_FAILED) throw std::ios_base::failure("Mapping failed");
			}

			MMap(const MMap&) = delete;
			MMap& operator=(const MMap&) = delete;

			// Swap-based moves: the previous mapping of the target (if any) travels to `o`
			// and is released by o's destructor.
			MMap(MMap&& o) noexcept
			{
				swap(o);
			}

			MMap& operator=(MMap&& o) noexcept
			{
				swap(o);
				return *this;
			}

			~MMap()
			{
				if (view)
				{
					munmap((void*)view, len);
				}
			}

			/** @return pointer to the first mapped byte, or nullptr for a moved-from object */
			const char* get() const { return view; }
			/** @return number of mapped bytes (the file size at mapping time) */
			size_t size() const { return len; }
		};
	}
}
/**
 * A std::streambuf over a caller-owned, fixed-size memory range.
 *
 * @tparam read  when true the range is installed as the get area
 * @tparam write when true the range is installed as the put area
 *
 * The buffer never owns or resizes the memory. Seeking is supported in both
 * directions; a seek that would leave the range fails (returns pos_type(-1))
 * instead of moving a pointer outside the buffer.
 */
template<bool read, bool write>
struct membuf : public std::streambuf
{
	/**
	 * @param base first byte of the memory range
	 * @param n    length of the range in bytes
	 */
	membuf(char* base, std::ptrdiff_t n)
	{
		if (read)
		{
			this->setg(base, base, base + n);
		}

		if (write)
		{
			this->setp(base, base + n);
		}
	}

	/** Absolute seek; forwards to seekoff() relative to the beginning. */
	pos_type seekpos(pos_type sp, std::ios_base::openmode which) override {
		return seekoff(sp - pos_type(off_type(0)), std::ios_base::beg, which);
	}

	/**
	 * Relative seek of the get and/or put position.
	 *
	 * @param off   signed offset
	 * @param dir   origin the offset is relative to (beg, cur or end)
	 * @param which std::ios_base::in, std::ios_base::out or both
	 * @return the new get position when `in` was requested, otherwise the new put
	 *         position; pos_type(off_type(-1)) if a requested area does not exist
	 *         or the target lies outside the buffer (nothing is moved in that case)
	 */
	pos_type seekoff(off_type off,
		std::ios_base::seekdir dir,
		std::ios_base::openmode which = std::ios_base::in
	) override {
		const pos_type failure = pos_type(off_type(-1));
		const bool wantIn = (which & std::ios_base::in) ? true : false;
		const bool wantOut = (which & std::ios_base::out) ? true : false;
		if (!wantIn && !wantOut) return failure;

		// Validate every requested direction first so a failing seek moves nothing.
		off_type inPos = 0, outPos = 0;
		if (wantIn)
		{
			if (!read) return failure;
			inPos = resolve(off, dir, gptr() - eback(), egptr() - eback());
			if (inPos < 0) return failure;
		}
		if (wantOut)
		{
			if (!write) return failure;
			outPos = resolve(off, dir, pptr() - pbase(), epptr() - pbase());
			if (outPos < 0) return failure;
		}

		if (wantIn)
		{
			setg(eback(), eback() + inPos, egptr());
		}
		if (wantOut)
		{
			// setp() rewinds pptr() to pbase() while keeping the area itself intact,
			// so `beg`/`end` keep referring to the real buffer on later seeks.
			setp(pbase(), epptr());
			// pbump() only accepts int: advance in bounded steps for large offsets.
			const off_type chunk = off_type(1) << 30;
			for (off_type left = outPos; left > 0; )
			{
				const off_type step = left < chunk ? left : chunk;
				pbump(static_cast<int>(step));
				left -= step;
			}
		}
		return pos_type(wantIn ? inPos : outPos);
	}

	/** @return the current get pointer */
	const char* curptr() const
	{
		return this->gptr();
	}

private:
	// Converts (off, dir) into an absolute offset inside [0, size];
	// returns -1 when the target falls outside that range.
	static off_type resolve(off_type off, std::ios_base::seekdir dir, off_type cur, off_type size)
	{
		off_type origin = 0;
		if (dir == std::ios_base::cur) origin = cur;
		else if (dir == std::ios_base::end) origin = size;
		else if (dir != std::ios_base::beg) return off_type(-1);

		// Compared in this form so that origin + off is never computed when it could overflow.
		if (off < -origin || off > size - origin) return off_type(-1);
		return origin + off;
	}
};
namespace kiwi
{
	namespace utils
	{
		/**
		 * Fixed-size pool of worker threads with an optional bound on the task queue.
		 *
		 * Every task receives the index of the worker running it (0 .. size()-1) as its
		 * first argument, followed by the arguments given to enqueue().
		 */
		class ThreadPool
		{
		public:
			/**
			 * @param threads   number of worker threads to start
			 * @param maxQueued maximum number of pending tasks; 0 means unbounded.
			 *                  When the bound is reached, enqueue() blocks until a
			 *                  worker takes a task.
			 */
			ThreadPool(size_t threads = 0, size_t maxQueued = 0);

			/**
			 * Schedules `f(workerId, args...)`.
			 *
			 * @return a future for the task's result
			 * @throws std::runtime_error if the pool has already been stopped
			 */
			template<class F, class... Args>
			auto enqueue(F&& f, Args&&... args)
				->std::future<typename std::result_of<F(size_t, Args...)>::type>;

			/** Stops the pool and joins all workers (see joinAll()). */
			~ThreadPool();

			/** @return number of worker threads */
			size_t size() const { return workers.size(); }

			/** @return number of tasks waiting in the queue (not yet picked up by a worker) */
			size_t numEnqueued() const
			{
				// The queue is mutated by the workers, so it must be read under the lock.
				std::unique_lock<std::mutex> lock(queue_mutex);
				return tasks.size();
			}

			/**
			 * Marks the pool as stopped, lets the workers drain the remaining tasks and
			 * joins them. Calling it again is a no-op.
			 */
			void joinAll();
		private:
			std::vector<std::thread> workers;
			std::queue<std::function<void(size_t)>> tasks;

			// mutable: const observers (numEnqueued) still need to lock.
			mutable std::mutex queue_mutex;
			std::condition_variable condition, inputCnd;
			size_t maxQueued;
			bool stop;
		};

		// Initializers are listed in declaration order (maxQueued before stop).
		inline ThreadPool::ThreadPool(size_t threads, size_t _maxQueued)
			: maxQueued(_maxQueued), stop(false)
		{
			for (size_t i = 0; i < threads; ++i)
				workers.emplace_back([this, i]
				{
					for (;;)
					{
						std::function<void(size_t)> task;
						{
							std::unique_lock<std::mutex> lock(this->queue_mutex);
							this->condition.wait(lock,
								[this] { return this->stop || !this->tasks.empty(); });
							// Leave only once stopping was requested AND the queue is drained.
							if (this->stop && this->tasks.empty()) return;
							task = std::move(this->tasks.front());
							this->tasks.pop();
							// A slot was freed: wake producers blocked on a full queue.
							if (this->maxQueued) this->inputCnd.notify_all();
						}
						task(i);
					}
				});
		}

		template<class F, class... Args>
		auto ThreadPool::enqueue(F&& f, Args&&... args)
			-> std::future<typename std::result_of<F(size_t, Args...)>::type>
		{
			using return_type = typename std::result_of<F(size_t, Args...)>::type;

			// The worker id is supplied at call time through the placeholder.
			auto task = std::make_shared< std::packaged_task<return_type(size_t)> >(
				std::bind(std::forward<F>(f), std::placeholders::_1, std::forward<Args>(args)...));

			std::future<return_type> res = task->get_future();
			{
				std::unique_lock<std::mutex> lock(queue_mutex);

				// don't allow enqueueing after stopping the pool
				if (stop) throw std::runtime_error("enqueue on stopped ThreadPool");
				if (maxQueued && tasks.size() >= maxQueued)
				{
					inputCnd.wait(lock, [&]() { return tasks.size() < maxQueued; });
				}
				tasks.emplace([task](size_t id) { (*task)(id); });
			}
			condition.notify_one();
			return res;
		}

		inline void ThreadPool::joinAll()
		{
			{
				// Test and set the flag under the same lock the workers use to read it.
				std::unique_lock<std::mutex> lock(queue_mutex);
				if (stop) return;
				stop = true;
			}
			condition.notify_all();
			for (std::thread& worker : workers)
				worker.join();
		}

		inline ThreadPool::~ThreadPool()
		{
			joinAll();
		}
	}
}
b/include/Trainer.h @@ -0,0 +1 @@ +#pragma once diff --git a/include/Trie.hpp b/include/Trie.hpp new file mode 100644 index 0000000..daf66ae --- /dev/null +++ b/include/Trie.hpp @@ -0,0 +1,333 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace kiwi +{ + namespace utils + { + template + class ConstAccess : public _Map + { + public: + auto operator[](typename _Map::key_type key) const -> typename _Map::mapped_type + { + auto it = this->find(key); + if (it == this->end()) return {}; + else return it->second; + } + + auto operator[](typename _Map::key_type key) -> typename _Map::mapped_type& + { + auto it = this->find(key); + if (it == this->end()) return this->emplace(key, typename _Map::mapped_type{}).first->second; + else return it->second; + } + }; + + template + class TrieIterator : public _Map::const_iterator + { + using Base = typename _Map::const_iterator; + using Key = typename _Map::key_type; + const _Node* base = nullptr; + public: + + TrieIterator(const Base& it, const _Node* _base) + : Base(it), base(_base) + { + } + + std::pair operator*() const + { + auto p = Base::operator*(); + return std::make_pair(p.first, base + p.second); + } + }; + + template>, class _Trie = void> + struct TrieNode + { + using Node = typename std::conditional::value, TrieNode, _Trie>::type; + using Key = _Key; + using Value = _Value; + using KeyStore = _KeyStore; + using iterator = TrieIterator<_KeyStore, Node>; + _KeyStore next = {}; + _Value val = {}; + int32_t fail = 0; + uint32_t depth = 0; + + TrieNode() {} + ~TrieNode() {} + + Node* getNext(_Key i) const + { + return next[i] ? (Node*)this + next[i] : nullptr; + } + + Node* getFail() const + { + return fail ? 
(Node*)this + fail : nullptr; + } + + iterator begin() const + { + return { next.begin(), (const Node*)this }; + } + + iterator end() const + { + return { next.end(), (const Node*)this }; + } + + template + Node* build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc) + { + if (first == last) + { + if (!val) val = _val; + return (Node*)this; + } + + auto v = *first; + if (!getNext(v)) + { + next[v] = alloc() - (Node*)this; + getNext(v)->depth = depth + 1; + } + return getNext(v)->build(++first, last, _val, alloc); + } + + template + Node* findNode(_TyIter begin, _TyIter end) + { + if (begin == end) return (Node*)this; + auto n = getNext(*begin); + if (n) return n->findNode(++begin, end); + return nullptr; + } + + template + void traverse(_Func func) + { + if (val) + { + if (func(val)) return; + } + for (auto& p : next) + { + if (getNext(p.first)) + { + getNext(p.first)->traverse(func); + } + } + return; + } + + template + void traverseWithKeys(_Fn&& fn, std::vector<_CKey>& rkeys, size_t maxDepth = -1, bool ignoreNegative = false) const + { + fn((Node*)this, rkeys); + + if (rkeys.size() >= maxDepth) return; + + for (auto& p : next) + { + if (ignoreNegative ? 
(p.second > 0) : (p.second)) + { + rkeys.emplace_back(p.first); + getNext(p.first)->traverseWithKeys(fn, rkeys, maxDepth, ignoreNegative); + rkeys.pop_back(); + } + } + } + + template + std::pair findMaximumMatch(_Iterator begin, _Iterator end, size_t idxCnt = 0) const + { + if (begin == end) return std::make_pair((Node*)this, idxCnt); + auto n = getNext(*begin); + if (n) + { + auto v = n->findMaximumMatch(++begin, end, idxCnt + 1); + if (v.first->val) return v; + } + return std::make_pair((Node*)this, idxCnt); + } + + Node* findFail(_Key i) const + { + if (!fail) // if this is Root + { + return (Node*)this; + } + else + { + if (getFail()->getNext(i)) // if 'i' node exists + { + return getFail()->getNext(i); + } + else // or loop for failure of this + { + return getFail()->findFail(i); + } + } + } + + void fillFail(bool ignoreNegative = false) + { + std::deque dq; + for (dq.emplace_back((Node*)this); !dq.empty(); dq.pop_front()) + { + auto p = dq.front(); + for (auto&& kv : p->next) + { + auto i = kv.first; + if (ignoreNegative && kv.second < 0) continue; + if (!p->getNext(i)) continue; + p->getNext(i)->fail = p->findFail(i) - p->getNext(i); + dq.emplace_back(p->getNext(i)); + + if (!p->val) + { + for (auto n = p; n->fail; n = n->getFail()) + { + if (!n->val) continue; + p->val = (_Value)-1; + break; + } + } + } + } + } + }; + + template>> + struct TrieNodeEx : public TrieNode<_Key, _Value, _KeyStore, TrieNodeEx<_Key, _Value, _KeyStore>> + { + int32_t parent = 0; + + template + TrieNodeEx* build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc) + { + if (first == last) + { + if (!this->val) this->val = _val; + return this; + } + + auto v = *first; + if (!this->getNext(v)) + { + this->next[v] = alloc() - this; + this->getNext(v)->parent = -this->next[v]; + } + return this->getNext(v)->build(++first, last, _val, alloc); + } + + template + TrieNodeEx* makeNext(const _Key& k, _FnAlloc&& alloc) + { + if (!this->next[k]) + { + this->next[k] = alloc() - 
this; + this->getNext(k)->parent = -this->next[k]; + auto f = this->getFail(); + if (f) + { + f = f->makeNext(k, std::forward<_FnAlloc>(alloc)); + this->getNext(k)->fail = f - this->getNext(k); + } + else + { + this->getNext(k)->fail = this - this->getNext(k); + } + } + return this + this->next[k]; + } + + TrieNodeEx* getParent() const + { + if (!parent) return nullptr; + return (TrieNodeEx*)this + parent; + } + }; + + template + class ContinuousTrie + { + std::vector<_TrieNode> nodes; + + public: + using Node = _TrieNode; + //using Key = typename Node::Key; + //using Value = typename Node::Value; + + ContinuousTrie() = default; + ContinuousTrie(size_t initSize) : nodes(initSize) {} + ContinuousTrie(size_t initSize, size_t initReserve) + { + nodes.reserve(initReserve); + nodes.resize(initSize); + } + + ContinuousTrie(const ContinuousTrie&) = default; + ContinuousTrie(ContinuousTrie&&) = default; + + ContinuousTrie& operator=(const ContinuousTrie&) = default; + ContinuousTrie& operator=(ContinuousTrie&&) = default; + + bool empty() const { return nodes.empty(); } + size_t size() const { return nodes.size(); } + + auto begin() -> decltype(nodes.begin()) { return nodes.begin(); } + auto begin() const -> decltype(nodes.begin()) { return nodes.begin(); } + auto end() -> decltype(nodes.end()) { return nodes.end(); } + auto end() const -> decltype(nodes.end()) { return nodes.end(); } + + void reserveMore(size_t n) + { + if (nodes.capacity() < nodes.size() + n) + { + nodes.reserve(std::max(nodes.size() + n, nodes.capacity() + nodes.capacity() / 2)); + } + } + + Node& operator[](size_t idx) { return nodes[idx]; } + const Node& operator[](size_t idx) const { return nodes[idx]; } + + Node& root() { return nodes[0]; } + const Node& root() const { return nodes[0]; } + + Node* newNode() + { + nodes.emplace_back(); + return &nodes.back(); + } + + template + Node* build(Iter first, Iter last, Value&& val) + { + size_t insertSize = std::distance(first, last); + 
reserveMore(insertSize); + + return nodes[0].build(first, last, val, [&]() { return newNode(); }); + } + + void fillFail(bool ignoreNegative = false) + { + return nodes[0].fillFail(ignoreNegative); + } + + template + void traverseWithKeys(_Fn&& fn, std::vector<_CKey>& rkeys, size_t maxDepth = -1, bool ignoreNegative = false) const + { + return nodes[0].traverseWithKeys(std::forward<_Fn>(fn), rkeys, maxDepth, ignoreNegative); + } + }; + } +} diff --git a/include/Types.h b/include/Types.h new file mode 100644 index 0000000..0f40d8f --- /dev/null +++ b/include/Types.h @@ -0,0 +1,280 @@ +/** + * @file Types.h + * @author bab2min (bab2min@gmail.com) + * @brief Kiwi C++ API에 쓰이는 주요 타입들을 모아놓은 헤더 파일 + * @version 0.10.0 + * @date 2021-08-31 + * + * + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef KIWI_USE_MIMALLOC +#include +#endif + +#define KIWI_DEFINE_ENUM_FLAG_OPERATORS(Type) \ +inline Type operator~(Type a)\ +{\ + return static_cast(~static_cast::type>(a));\ +}\ +inline bool operator!(Type a)\ +{\ + return a == static_cast(0);\ +}\ +inline Type operator|(Type a, Type b)\ +{\ + return static_cast(static_cast::type>(a) | static_cast::type>(b));\ +}\ +inline Type operator&(Type a, Type b)\ +{\ + return static_cast(static_cast::type>(a) & static_cast::type>(b));\ +}\ +inline Type operator^(Type a, Type b)\ +{\ + return static_cast(static_cast::type>(a) ^ static_cast::type>(b));\ +}\ +inline Type operator|=(Type& a, Type b)\ +{\ + return reinterpret_cast(reinterpret_cast::type&>(a) |= static_cast::type>(b));\ +}\ +inline Type operator&=(Type& a, Type b)\ +{\ + return reinterpret_cast(reinterpret_cast::type&>(a) &= static_cast::type>(b));\ +}\ +inline Type operator^=(Type& a, Type b)\ +{\ + return reinterpret_cast(reinterpret_cast::type&>(a) ^= static_cast::type>(b));\ +} + +namespace kiwi +{ + typedef char16_t kchar_t; + + class Exception : public std::runtime_error + { + public: + using 
std::runtime_error::runtime_error; + }; + + class UnicodeException : public Exception + { + public: + using Exception::Exception; + }; + +#ifdef KIWI_USE_MIMALLOC + template + using Vector = std::vector<_Ty, mi_stl_allocator<_Ty>>; + + template + using UnorderedMap = std::unordered_map<_K, _V, std::hash<_K>, std::equal_to<_K>, mi_stl_allocator>>; + + using KString = std::basic_string, mi_stl_allocator>; + using KStringStream = std::basic_stringstream, mi_stl_allocator>; + using KcVector = Vector; + using KcScores = Vector>; +#else + /** + * @brief std::vector의 내부용 타입. mimalloc 옵션에 따라 mi_stl_allocator로부터 메모리를 할당받는다. + * + * @note Vector는 std::vector와 동일한 역할을 수행하지만, + * mimalloc 사용시 Vector가 좀 더 빠른 속도로 메모리를 할당 받을 수 있음. + * Vector와 std::vector는 섞어 쓸 수 없다. + * Kiwi 내부에서만 사용할 것이라면 Vector를, 외부로 반환해야할 값이라면 std::vector를 사용할 것. + */ + template + using Vector = std::vector<_Ty>; + + /** + * @brief std::unordered_map의 내부용 타입. mimalloc 옵션에 따라 mi_stl_allocator로부터 메모리를 할당받는다. + * + * @note UnorderMap은 std::unordered_map과 동일한 역할을 수행하지만, + * mimalloc 사용시 UnorderMap이 좀 더 빠른 속도로 메모리를 할당 받을 수 있음. + * @sa Vector + */ + template + using UnorderedMap = std::unordered_map<_K, _V>; + + /** + * @brief std::u16string의 내부용 타입. mimalloc 옵션에 따라 mi_stl_allocator로부터 메모리를 할당받는다. + * + * @note KString은 std::u16string과 동일한 역할을 수행하지만, + * mimalloc 사용시 KString이 좀 더 빠른 속도로 메모리를 할당 받을 수 있음. + * @sa Vector + */ + using KString = std::basic_string; + using KStringStream = std::basic_stringstream; + using KcVector = Vector; + using KcScores = Vector>; +#endif + + /** + * @brief 형태소 품사 태그와 관련된 열거형 + * + * @note 나머지 품사 태그에 대한 정보는 README.md 를 참조할 것. 
+ */ + enum class POSTag : uint8_t + { + unknown, /**< 미설정 */ + nng, nnp, nnb, + vv, va, + mag, + nr, np, + vx, + mm, maj, + ic, + xpn, xsn, xsv, xsa, xr, + vcp, vcn, + sf, sp, ss, se, so, sw, + sl, sh, sn, + w_url, w_email, w_mention, w_hashtag, + jks, jkc, jkg, jko, jkb, jkv, jkq, jx, jc, + ep, ef, ec, etn, etm, + v, /**< 분할된 동사/형용사를 나타내는데 사용됨 */ + max, /**< POSTag의 총 개수를 나타내는 용도 */ + }; + + constexpr size_t defaultTagSize = (size_t)POSTag::jks; + + /** + * @brief 선행 형태소의 종성 여부 조건과 관련된 열거형 + * + */ + enum class CondVowel : uint8_t + { + none, /**< 조건이 설정되지 않음 */ + any, /**< 자음, 모음 여부와 상관 없이 등장 가능 */ + vowel, /**< 선행 형태소가 받침이 없는 경우만 등장 가능*/ + vocalic, /**< 선행 형태소가 받침이 없거나 ㄹ받침인 경우만 등장 가능*/ + vocalic_h, /**< 선행 형태소가 받침이 없거나 ㄹ, ㅎ 받침인 경우만 등장 가능 */ + non_vowel, /**< `vowel`의 부정 */ + non_vocalic, /**< `vocalic`의 부정 */ + non_vocalic_h, /**< `vocalic_h`의 부정 */ + }; + + /** + * @brief 선행 형태소의 양/음성 조건(모음 조화)과 관련된 열거형 + * + */ + enum class CondPolarity : char + { + none, /**< 조건이 설정되지 않음 */ + positive, /**< 선행 형태소가 양성(ㅏ,ㅑ,ㅗ)인 경우만 등장 가능 */ + negative, /**< 선행 형태소가 음성(그 외)인 경우만 등장 가능 */ + }; + + /** + * @brief KiwiBuilder 생성시 사용되는 비트 플래그 + * + * @sa `kiwi::KiwiBuilder` + */ + enum class BuildOption + { + none = 0, + + integrateAllomorph = 1 << 0, /**< 이형태 통합 여부를 설정한다. 이 옵션을 사용시 `아/EC, 어/EC, 여/EC` 와 같은 형태소들이 `어/EC`로 통합되어 출력된다. */ + + loadDefaultDict = 1 << 1, /**< 기본 사전(default.dict)의 로딩 여부를 설정한다. 기본 사전은 위키백과 및 나무위키의 표제어로 구성되어 있다. 
*/ + }; + + struct Morpheme; + + /** + * @brief 분석 완료된 각 형태소들의 정보를 담는 구조체 + * + */ + struct TokenInfo + { + std::u16string str; /**< 형태 */ + uint32_t position = 0; /**< 시작 위치(UTF16 문자 기준) */ + uint16_t length = 0; /**< 길이(UTF16 문자 기준) */ + uint16_t wordPosition = 0; /**< 어절 번호(공백 기준)*/ + POSTag tag = POSTag::unknown; /**< 품사 태그 */ + const Morpheme* morph = nullptr; /**< 기타 형태소 정보에 대한 포인터 (OOV인 경우 nullptr) */ + + TokenInfo() = default; + + TokenInfo(const std::u16string& _str, + POSTag _tag = POSTag::unknown, + uint16_t _length = 0, + uint32_t _position = 0, + uint16_t _wordPosition = 0 + ) + : str{ _str }, position{ _position }, length{ _length }, wordPosition{ _wordPosition }, tag{ _tag } + { + } + + bool operator==(const TokenInfo& o) const + { + return str == o.str && tag == o.tag; + } + + bool operator!=(const TokenInfo& o) const + { + return !operator==(o); + } + }; + + struct FormCond + { + KString form; + CondVowel vowel; + CondPolarity polar; + + FormCond(); + ~FormCond(); + FormCond(const FormCond&); + FormCond(FormCond&&); + FormCond& operator=(const FormCond&); + FormCond& operator=(FormCond&&); + + FormCond(const KString& _form, CondVowel _vowel, CondPolarity _polar); + bool operator==(const FormCond& o) const; + bool operator!=(const FormCond& o) const; + }; + + /** + * @brief 분석 완료된 형태소의 목록(`std::vector`)과 점수(`float`)의 pair 타입 + * + */ + using TokenResult = std::pair, float>; + + using U16Reader = std::function; + using U16MultipleReader = std::function; +} + +namespace std +{ +#ifdef KIWI_USE_MIMALLOC + template<> + struct hash + { + size_t operator()(const kiwi::KString& s) const + { + return hash>{}({ s.begin(), s.end() }); + } + }; +#endif + + template<> + struct hash + { + size_t operator()(const kiwi::FormCond& fc) const + { + return hash{}(fc.form) ^ ((size_t)fc.vowel | ((size_t)fc.polar << 8)); + } + }; +} + +KIWI_DEFINE_ENUM_FLAG_OPERATORS(kiwi::BuildOption); diff --git a/include/Utils.h b/include/Utils.h new file mode 100644 index 
/**
 * Splits `s` at every occurrence of `delim` and writes the pieces to `result`.
 *
 * Empty pieces are kept, so a string containing k delimiters always yields
 * k + 1 pieces (an empty string yields one empty piece).
 *
 * @param s      string to split
 * @param delim  delimiter character
 * @param result output iterator receiving each piece as a basic_string
 */
template<class BaseChr, class OutIterator>
void split(const std::basic_string<BaseChr>& s, BaseChr delim, OutIterator result)
{
	size_t start = 0;
	for (size_t hit = s.find(delim, start); hit != s.npos; hit = s.find(delim, start))
	{
		*result = s.substr(start, hit - start);
		++result;
		start = hit + 1;
	}
	// Whatever follows the last delimiter (possibly nothing) is the final piece.
	*result = s.substr(start);
	++result;
}

/**
 * Convenience overload of split() that collects the pieces into a vector.
 *
 * @param s     string to split
 * @param delim delimiter character
 * @return the pieces in order of appearance
 */
template<class BaseChr>
inline std::vector<std::basic_string<BaseChr>> split(const std::basic_string<BaseChr>& s, BaseChr delim)
{
	std::vector<std::basic_string<BaseChr>> pieces;
	split(s, delim, std::back_inserter(pieces));
	return pieces;
}
/**
 * Iterates over the whitespace-separated chunks of a UTF-16 string.
 *
 * Construct it from a [begin, end) range to obtain an iterator positioned on the
 * first chunk; a default-constructed instance serves as the end marker. An
 * iterator compares equal to the end marker once it has run out of chunks.
 */
class SpaceSplitIterator
{
	// Whitespace here means exactly: space, \f, \n, \r, \t and \v.
	static bool isspace(char16_t c)
	{
		return c == u' ' || c == u'\f' || c == u'\n'
			|| c == u'\r' || c == u'\t' || c == u'\v';
	}

	std::u16string::const_iterator mBegin, mChunk, mEnd;

	// Skips leading whitespace from mBegin, then puts mChunk one past the end of
	// the chunk that starts there.
	void locateChunk()
	{
		while (mBegin != mEnd && isspace(*mBegin)) ++mBegin;
		for (mChunk = mBegin; mChunk != mEnd && !isspace(*mChunk); ++mChunk)
		{
		}
	}
public:
	/**
	 * @param _begin start of the text to scan
	 * @param _end   end of the text to scan
	 */
	SpaceSplitIterator(const std::u16string::const_iterator& _begin = {}, const std::u16string::const_iterator& _end = {})
		: mBegin(_begin), mEnd(_end)
	{
		locateChunk();
	}

	/** Advances to the next chunk. */
	SpaceSplitIterator& operator++()
	{
		mBegin = mChunk;
		locateChunk();
		return *this;
	}

	/**
	 * If `o` is exhausted, the result says whether this iterator is exhausted too;
	 * otherwise the two are equal when their chunks start at the same position.
	 */
	bool operator==(const SpaceSplitIterator& o) const
	{
		const bool otherExhausted = (o.mBegin == o.mEnd);
		return otherExhausted ? (mBegin == mEnd) : (mBegin == o.mBegin);
	}

	bool operator!=(const SpaceSplitIterator& o) const
	{
		return !(*this == o);
	}

	/** @return a copy of the current chunk */
	std::u16string operator*() const
	{
		return std::u16string(mBegin, mChunk);
	}

	/** @return iterator to the first character of the current chunk */
	std::u16string::const_iterator strBegin() const { return mBegin; }
	/** @return iterator one past the last character of the current chunk */
	std::u16string::const_iterator strEnd() const { return mChunk; }
	/** @return length of the current chunk in UTF-16 code units */
	size_t strSize() const { return std::distance(mBegin, mChunk); }
};
_score = 0, float _lBranch = 0, float _rBranch = 0, + float _lCohesion = 0, float _rCohesion = 0, uint32_t _freq = 0, + std::map&& _posScore = {}) + : form(_form), score(_score), lBranch(_lBranch), rBranch(_rBranch), + lCohesion(_lCohesion), rCohesion(_rCohesion), freq(_freq), posScore(_posScore) + {} + }; + + class WordDetector + { + struct Counter; + protected: + size_t numThreads = 0; + std::map, std::map> posScore; + std::map nounTailScore; + + void loadPOSModelFromTxt(std::istream& is); + void loadNounTailModelFromTxt(std::istream& is); + + void countUnigram(Counter&, const U16Reader& reader, size_t minCnt) const; + void countBigram(Counter&, const U16Reader& reader, size_t minCnt) const; + void countNgram(Counter&, const U16Reader& reader, size_t minCnt, size_t maxWordLen) const; + float branchingEntropy(const std::map& cnt, std::map::iterator it, size_t minCnt, float defaultPerp = 1.f) const; + std::map getPosScore(Counter&, const std::map& cnt, std::map::iterator it, bool coda, const std::u16string& realForm) const; + public: + + struct FromRawData {}; + static constexpr FromRawData fromRawDataTag = {}; + + WordDetector() = default; + WordDetector(const std::string& modelPath, size_t _numThreads = 0); + WordDetector(FromRawData, const std::string& modelPath, size_t _numThreads = 0); + + bool ready() const + { + return !posScore.empty(); + } + + void saveModel(const std::string& modelPath) const; + std::vector extractWords(const U16MultipleReader& reader, size_t minCnt = 10, size_t maxWordLen = 10, float minScore = 0.1f) const; + }; + +} \ No newline at end of file diff --git a/include/capi.h b/include/capi.h new file mode 100644 index 0000000..cbdd31a --- /dev/null +++ b/include/capi.h @@ -0,0 +1,460 @@ +/** + * @file capi.h + * @author bab2min (bab2min@gmail.com) + * @brief Kiwi C API를 담고 있는 헤더 파일 + * @version 0.10.0 + * @date 2021-08-31 + * + * + */ + +#pragma once + +#include "Macro.h" + +#define KIWIERR_FAIL -1 +#define KIWIERR_INVALID_HANDLE -2 
+#define KIWIERR_INVALID_INDEX -3 + +#if !defined(DLL_EXPORT) +#define DECL_DLL +#elif defined(_MSC_VER) +#define DECL_DLL __declspec(dllexport) +#elif defined(__GNUC__) +#define DECL_DLL __attribute__((visibility("default"))) +#endif + +typedef struct kiwi_s* kiwi_h; +typedef struct kiwi_builder* kiwi_builder_h; +typedef struct kiwi_res* kiwi_res_h; +typedef struct kiwi_ws* kiwi_ws_h; +typedef unsigned short kchar16_t; + +/* +int (*kiwi_reader_t)(int id, char* buffer, void* user_data) +id: id number of line to be read. if id == 0, kiwi_reader should roll back file and read lines from the beginning +buffer: buffer where string data should be stored. if buffer == null, kiwi_reader provide the length of string as return value. +user_data: user_data from kiwi_extract~, kiwi_perform, kiwi_analyze_m functions. +*/ + +/** + * @brief 문자열을 읽어들여 Kiwi에 제공하기 위한 콜백 함수 타입 + * + * @param int 읽어들일 문자열의 줄 번호입니다. 0부터 시작하여 차례로 1씩 증가합니다. + * @param char* 읽어들인 문자열이 저장될 버퍼의 주소입니다. 이 값이 null인 경우 버퍼의 크기를 반환해야 합니다. + * @param void* user data를 위한 인자입니다. + * + * @return int 두번째 인자가 null인 경우 읽어들일 버퍼의 크기를 반환합니다. + */ +typedef int(*kiwi_reader_t)(int, char*, void*); +typedef int(*kiwi_reader_w_t)(int, kchar16_t*, void*); + + +typedef int(*kiwi_receiver_t)(int, kiwi_res_h, void*); + +enum +{ + KIWI_BUILD_LOAD_DEFAULT_DICT = 1, + KIWI_BUILD_INTEGRATE_ALLOMORPH = 2, + KIWI_BUILD_DEFAULT = 3, +}; + +enum +{ + KIWI_NUM_THREADS = 0x8001, +}; + +enum +{ + KIWI_MATCH_URL = 1, + KIWI_MATCH_EMAIL = 2, + KIWI_MATCH_HASHTAG = 4, + KIWI_MATCH_MENTION = 8, + KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION, + KIWI_MATCH_NORMALIZE_CODA = 65536, + KIWI_MATCH_ALL_WITH_NORMALIZING = KIWI_MATCH_ALL | KIWI_MATCH_NORMALIZE_CODA, +}; + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief 설치된 Kiwi의 버전을 반환합니다. + * + * @return "major.minor.patch"로 구성되는 버전 문자열 + */ +DECL_DLL const char* kiwi_version(); + +/** + * @brief 현재 스레드에서 발생한 에러 메세지를 반환합니다. 
발생한 에러가 없을 경우 nullptr를 반환합니다. + * + * @return 에러 메세지 혹은 nullptr + */ +DECL_DLL const char* kiwi_error(); + +/** + * @brief 현재 스레드의 에러 메세지를 초기화합니다. + * + * @return + */ +DECL_DLL void kiwi_clear_error(); + +/** + * @brief Kiwi Builder를 생성합니다 + * + * @param model_path 모델의 경로 + * @param num_threads 사용할 스레드의 개수. 0으로 지정시 가용한 스레드 개수를 자동으로 판단합니다. + * @param options 생성 옵션. KIWI_BUILD_* 열거형을 참조하십시오. + * @return 성공 시 Kiwi Builder의 핸들을 반환합니다. + * 실패시 nullptr를 반환하고 에러 메세지를 설정합니다. + * 에러 메세지는 kiwi_error()를 통해 확인할 수 있습니다. + */ +DECL_DLL kiwi_builder_h kiwi_builder_init(const char* model_path, int num_threads, int options); + +/** + * @brief + * + * @param handle + * @return + */ +DECL_DLL int kiwi_builder_close(kiwi_builder_h handle); + +/** + * @brief + * + * @param handle + * @param word + * @param pos + * @param score + * @return + */ +DECL_DLL int kiwi_builder_add_word(kiwi_builder_h handle, const char* word, const char* pos, float score); + +/** + * @brief + * + * @param handle + * @param dict_path + * @return + */ +DECL_DLL int kiwi_builder_load_dict(kiwi_builder_h handle, const char* dict_path); + +/** + * @brief + * + * @param handle + * @param reader + * @param user_data + * @param min_cnt + * @param max_word_len + * @param min_score + * @param pos_threshold + * @return + */ +DECL_DLL kiwi_ws_h kiwi_builder_extract_words(kiwi_builder_h handle, kiwi_reader_t reader, void* user_data, int min_cnt, int max_word_len, float min_score, float pos_threshold); + +/** + * @brief + * + * @param handle + * @param reader + * @param user_data + * @param min_cnt + * @param max_word_len + * @param min_score + * @param pos_threshold + * @return + */ +DECL_DLL kiwi_ws_h kiwi_builder_extract_add_words(kiwi_builder_h handle, kiwi_reader_t reader, void* user_data, int min_cnt, int max_word_len, float min_score, float pos_threshold); + +/** + * @brief + * + * @param handle + * @param reader + * @param user_data + * @param min_cnt + * @param max_word_len + * @param min_score + * @param 
pos_threshold + * @return + */ +DECL_DLL kiwi_ws_h kiwi_builder_extract_words_w(kiwi_builder_h handle, kiwi_reader_w_t reader, void* user_data, int min_cnt, int max_word_len, float min_score, float pos_threshold); + +/** + * @brief + * + * @param handle + * @param reader + * @param user_data + * @param min_cnt + * @param max_word_len + * @param min_score + * @param pos_threshold + * @return + */ +DECL_DLL kiwi_ws_h kiwi_builder_extract_add_words_w(kiwi_builder_h handle, kiwi_reader_w_t reader, void* user_data, int min_cnt, int max_word_len, float min_score, float pos_threshold); + +/** + * @brief + * + * @param handle + * @return + */ +DECL_DLL kiwi_h kiwi_builder_build(kiwi_builder_h handle); + +/** + * @brief + * + * @param model_path + * @param num_threads + * @param options + * @return + */ +DECL_DLL kiwi_h kiwi_init(const char* model_path, int num_threads, int options); + +/** + * @brief + * + * @param handle + * @param option + * @param value + * @return + */ +DECL_DLL void kiwi_set_option(kiwi_h handle, int option, int value); + +/** + * @brief + * + * @param handle + * @param option + * @return + */ +DECL_DLL int kiwi_get_option(kiwi_h handle, int option); + +/** + * @brief + * + * @param handle + * @param text + * @param top_n + * @param match_options + * @return + */ +DECL_DLL kiwi_res_h kiwi_analyze_w(kiwi_h handle, const kchar16_t* text, int top_n, int match_options); + +/** + * @brief + * + * @param handle + * @param text + * @param top_n + * @param match_options + * @return + */ +DECL_DLL kiwi_res_h kiwi_analyze(kiwi_h handle, const char* text, int top_n, int match_options); + +/** + * @brief + * + * @param handle + * @param reader + * @param receiver + * @param user_data + * @param top_n + * @param match_options + * @return + */ +DECL_DLL int kiwi_analyze_mw(kiwi_h handle, kiwi_reader_w_t reader, kiwi_receiver_t receiver, void* user_data, int top_n, int match_options); + +/** + * @brief + * + * @param handle + * @param reader + * @param receiver + * 
@param user_data + * @param top_n + * @param match_options + * @return + */ +DECL_DLL int kiwi_analyze_m(kiwi_h handle, kiwi_reader_t reader, kiwi_receiver_t receiver, void* user_data, int top_n, int match_options); + +/** + * @brief + * + * @param handle + * @return + */ +DECL_DLL int kiwi_close(kiwi_h handle); + +/** + * @brief + * + * @param result + * @return + */ +DECL_DLL int kiwi_res_size(kiwi_res_h result); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL float kiwi_res_prob(kiwi_res_h result, int index); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL int kiwi_res_word_num(kiwi_res_h result, int index); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL const kchar16_t* kiwi_res_form_w(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL const kchar16_t* kiwi_res_tag_w(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL const char* kiwi_res_form(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL const char* kiwi_res_tag(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL int kiwi_res_position(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL int kiwi_res_length(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL int kiwi_res_word_position(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @return + */ +DECL_DLL int kiwi_res_close(kiwi_res_h result); + + +/** + * @brief + * + * @param result + 
* @return + */ +DECL_DLL int kiwi_ws_size(kiwi_ws_h result); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL const kchar16_t* kiwi_ws_form_w(kiwi_ws_h result, int index); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL const char* kiwi_ws_form(kiwi_ws_h result, int index); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL float kiwi_ws_score(kiwi_ws_h result, int index); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL int kiwi_ws_freq(kiwi_ws_h result, int index); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL float kiwi_ws_pos_score(kiwi_ws_h result, int index); + +/** + * @brief + * + * @param result + * @return + */ +DECL_DLL int kiwi_ws_close(kiwi_ws_h result); + +#ifdef __cplusplus +} +#endif diff --git a/include/kiwi/BitEncoder.hpp b/include/kiwi/BitEncoder.hpp new file mode 100644 index 0000000..a1330f9 --- /dev/null +++ b/include/kiwi/BitEncoder.hpp @@ -0,0 +1,487 @@ +#pragma once + +#include +#include +#include "BitUtils.h" + +namespace kiwi +{ + namespace lm + { + namespace detail + { + template + struct gcd + { + static constexpr size_t value = gcd::value; + }; + + template + struct gcd + { + static constexpr size_t value = a; + }; + + template + struct lcm + { + static constexpr size_t value = a * b / gcd::value; + }; + + template using Invoke = typename _T::type; + + template struct seq { using type = seq; }; + + template struct concat; + + template + struct concat, seq<_i2...>> + : seq<_i1..., (sizeof...(_i1) + _i2)...> {}; + + template + using Concat = Invoke>; + + template struct gen_seq; + template using GenSeq = Invoke>; + + template + struct gen_seq : Concat, GenSeq<_n - _n / 2>> {}; + + template<> struct gen_seq<0> : seq<> {}; + template<> struct gen_seq<1> : seq<0> {}; + + template + struct SeqSize; + + template + struct SeqSize> + { + static constexpr size_t value = 
sizeof...(_i); + }; + + template + struct slice; + + template + using Slice = Invoke>; + + template + struct slice, _j...> + { + using type = Slice, _j..., first>; + }; + + template + struct slice<0, seq, _j...> + { + using type = seq<_j...>; + }; + + template + struct slice<0, seq<>, _j...> + { + using type = seq<_j...>; + }; + + template + struct get; + + template + struct get> : get> + { + }; + + template + struct get<0, seq> : std::integral_constant + { + }; + + template<> + struct get<0, seq<>> + { + }; + } + + template + class FixedLengthEncoder + { + static constexpr size_t packetBits = sizeof(Packet) * 8; + static constexpr size_t bufSize = bits / detail::gcd::value; + static constexpr size_t numPhases = bufSize * packetBits / bits; + static constexpr size_t mask = (1 << bits) - 1; + std::array buf = { {0,} }; + size_t bPhase = 0; + Stream stream; + + void fetch() + { + stream.read((char*)buf.data(), bufSize * sizeof(Packet)); + } + + template + void writePhase(size_t i) + { + constexpr size_t packetPrefix = (bits * phase) / packetBits; + constexpr size_t bitPos = (bits * phase) % packetBits; + constexpr size_t packetBegin = (bits * phase + packetBits - 1) / packetBits; + constexpr size_t packetEnd = (bits * (phase + 1) + packetBits - 1) / packetBits; + + if (bitPos) + { + buf[packetPrefix] |= static_cast(i << bitPos); + i >>= packetBits - bitPos; + } + + for (size_t p = packetBegin; p < packetEnd; ++p) + { + buf[p] = static_cast(i); + i >>= packetBits; + } + + bPhase++; + if (phase == numPhases - 1) + { + flush(); + } + } + + template + void writeDispatch(size_t i, detail::seq) + { + using WriteFn = void(FixedLengthEncoder::*)(size_t); + + static constexpr WriteFn table[] = { + &FixedLengthEncoder::writePhase... 
+ }; + return (this->*table[bPhase])(i); + } + + template + size_t readPhase() + { + constexpr size_t packetPrefix = (bits * phase) / packetBits; + constexpr size_t bitPos = (bits * phase) % packetBits; + constexpr size_t packetBegin = (bits * phase + packetBits - 1) / packetBits; + constexpr size_t packetEnd = (bits * (phase + 1) + packetBits - 1) / packetBits; + constexpr size_t shiftBias = bitPos ? (packetBits - bitPos) : 0; + + if (phase == 0) + { + fetch(); + } + + size_t i = 0; + if (bitPos) + { + i = buf[packetPrefix] >> bitPos; + } + + for (size_t p = packetBegin; p < packetEnd; ++p) + { + i |= buf[p] << (shiftBias + (p - packetBegin) * packetBits); + } + + if (phase == numPhases - 1) + { + bPhase = 0; + } + else + { + bPhase++; + } + return i & mask; + } + + template + size_t readDispatch(detail::seq) + { + using ReadFn = size_t(FixedLengthEncoder::*)(); + + static constexpr ReadFn table[] = { + &FixedLengthEncoder::readPhase... + }; + return (this->*table[bPhase])(); + } + + public: + + template + FixedLengthEncoder(Args&&... args) + : stream( std::forward(args)... 
) + { + } + + void write(size_t i) + { + return writeDispatch(i & mask, detail::gen_seq{}); + } + + size_t read() + { + return readDispatch(detail::gen_seq{}); + } + + void flush() + { + stream.write((const char*)buf.data(), ((bPhase * bits + packetBits - 1) / packetBits) * sizeof(Packet)); + std::fill(buf.begin(), buf.end(), 0); + bPhase = 0; + } + + Stream& getStream() { return stream; } + const Stream& getStream() const { return stream; } + }; + + template + using BitSeq = detail::seq; + + namespace detail + { + template + struct VLTransform; + + template + struct VLTransform + { + Encoder& encoder; + + VLTransform(Encoder& _encoder) : encoder( _encoder ) + { + } + + void encode(size_t i) + { + constexpr size_t z = offset + (1 << firstBits); + if (i < z) + { + return encoder.template write(((i - offset) << (depth + 1)) | ((1 << depth) - 1)); + } + return VLTransform{ encoder }.encode(i); + } + + static constexpr size_t bias = VLTransform::bias; + }; + + template + struct VLTransform + { + Encoder& encoder; + + VLTransform(Encoder& _encoder) : encoder( _encoder ) + { + } + + void encode(size_t i) + { + constexpr size_t z = offset + (1 << firstBits); + if (i < z) + { + return encoder.template write(((i - offset) << depth) | ((1 << depth) - 1)); + } + throw std::runtime_error{ "failed to encode. 
out of range" }; + } + + static constexpr size_t bias = offset + (1 << firstBits); + }; + + template + struct VLTransform + { + Encoder& encoder; + + VLTransform(Encoder& _encoder) : encoder{ _encoder } + { + } + + static constexpr size_t bias = 0; + }; + + template + VLTransform makeVLTransform(Encoder& enc, BitSeq) + { + return { enc }; + } + + inline size_t getPrefixWidth(uint32_t mask) + { + return utils::countTrailingZeroes(~mask); + } + + inline size_t getPrefixWidth(uint64_t mask) + { + return utils::countTrailingZeroes(~mask); + } +#ifdef __APPLE__ + inline size_t getPrefixWidth(size_t mask) { return getPrefixWidth((uint64_t)mask); } +#endif + } + + template + class VariableLengthEncoder + { + template + friend struct detail::VLTransform; + + protected: + static constexpr size_t packetBits = sizeof(Packet) * 8; + std::array buf = { {0,} }; + Packet lastPacket = 0; + ptrdiff_t bitPos = 0; + Stream stream; + + void fetch() + { + lastPacket = buf[bufSize - 1]; + stream.read((char*)buf.data(), bufSize * sizeof(Packet)); + } + + template + void write(size_t i) + { + const ptrdiff_t packetPrefix = bitPos / packetBits; + const ptrdiff_t bitP = bitPos % packetBits; + const ptrdiff_t packetBegin = (bitPos + packetBits - 1) / packetBits; + const ptrdiff_t packetLen = (bitPos + bitwidth + packetBits - 1) / packetBits - packetBegin; + + if (bitP) + { + buf[packetPrefix] |= static_cast(i << bitP); + i >>= packetBits - bitP; + } + + size_t p, pp; + for (p = 0, pp = packetBegin; p < packetLen; ++p, ++pp) + { + if (pp == bufSize) + { + flush(true); + pp = 0; + } + buf[pp] = static_cast(i); + i >>= packetBits; + } + bitPos = (bitPos + bitwidth) % (bufSize * packetBits); + if (bitPos == 0 && pp == bufSize) + { + flush(true); + } + } + + size_t readBits(size_t width) + { + size_t i = 0; + + ptrdiff_t packetPrefix; + ptrdiff_t bitP; + ptrdiff_t packetBegin; + ptrdiff_t packetLen; + ptrdiff_t shiftBias; + if (bitPos < 0) + { + i = lastPacket >> (bitPos + packetBits); + 
packetPrefix = 0; + bitP = 0; + packetBegin = 0; + packetLen = (bitPos + width + packetBits - 1) / packetBits - packetBegin; + shiftBias = -bitPos; + } + else + { + packetPrefix = bitPos / packetBits; + bitP = bitPos % packetBits; + packetBegin = (bitPos + packetBits - 1) / packetBits; + packetLen = (bitPos + width + packetBits - 1) / packetBits - packetBegin; + shiftBias = bitP ? (packetBits - bitP) : 0; + } + + if (bitP) + { + i = buf[packetPrefix] >> bitP; + } + + size_t p, pp; + for (p = 0, pp = packetBegin; p < packetLen; ++p, ++pp) + { + if (pp == bufSize) + { + fetch(); + pp = 0; + } + i |= buf[pp] << (shiftBias + p * packetBits); + } + if (bitPos > 0 && (bitPos + width) % (bufSize * packetBits) == 0 && pp == bufSize) + { + fetch(); + } + + if (bitPos >= 0) bitPos = (bitPos + width) % (bufSize * packetBits); + else bitPos += width; + return i & ((1 << width) - 1); + } + + template + size_t readV() + { + size_t i = readBits(detail::get::value); + return i + decltype(detail::makeVLTransform(*this, detail::Slice{}))::bias; + } + + template + size_t readVDispatch(size_t width, detail::seq) + { + using ReadFn = size_t(VariableLengthEncoder::*)(); + + static constexpr ReadFn table[] = { + &VariableLengthEncoder::readV... + }; + return (this->*table[width])(); + } + + public: + + static constexpr size_t min_value = 0; + static constexpr size_t max_value = decltype(detail::makeVLTransform(std::declval(), BitSeqs{}))::bias - 1; + + template + VariableLengthEncoder(Args&&... args) + : stream( std::forward(args)... 
) + { + } + + void write(size_t i) + { + detail::makeVLTransform(*this, BitSeqs{}).encode(i); + } + + size_t read() + { + constexpr size_t maxPrefixWidth = detail::SeqSize::value - 1; + size_t i = readBits(maxPrefixWidth); + size_t prefixWidth = detail::getPrefixWidth(i); + bitPos -= maxPrefixWidth - std::min(prefixWidth + 1, maxPrefixWidth); + return readVDispatch(prefixWidth, detail::gen_seq::value>{}); + } + + void flush(bool full = false) + { + stream.write((const char*)buf.data(), full ? (bufSize * sizeof(Packet)) : ((bitPos + packetBits - 1) / packetBits * sizeof(Packet))); + std::fill(buf.begin(), buf.end(), 0); + } + + Stream& getStream() { return stream; } + const Stream& getStream() const { return stream; } + }; + + template + class VariableLengthDecoder : public VariableLengthEncoder + { + public: + template + VariableLengthDecoder(Args&&... args) + : VariableLengthEncoder( std::forward(args)... ) + { + this->fetch(); + } + }; + } +} diff --git a/include/kiwi/BitUtils.h b/include/kiwi/BitUtils.h new file mode 100644 index 0000000..924eafb --- /dev/null +++ b/include/kiwi/BitUtils.h @@ -0,0 +1,110 @@ +#pragma once +#include + +#if defined(__SSE2__) || defined(__AVX2__) + #include +#endif + +namespace kiwi +{ + namespace utils + { + inline int countTrailingZeroes(uint32_t v) + { + if (v == 0) + { + return 32; + } +#if defined(__GNUC__) + return __builtin_ctz(v); +#elif defined(_MSC_VER) + unsigned long count; + _BitScanForward(&count, v); + return (int)count; +#else + // See Stanford bithacks, count the consecutive zero bits (trailing) on the + // right with multiply and lookup: + // http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup + static const uint8_t tbl[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, + 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, + 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; + return (int)tbl[((uint32_t)((v & -v) * 0x077CB531U)) >> 27]; +#endif + } + + inline int countTrailingZeroes(uint64_t v) + { + if (v == 0) + { + 
return 64; + } +#if defined(__GNUC__) + return __builtin_ctzll(v); +#elif defined(_MSC_VER) && defined(_M_X64) + unsigned long count; + _BitScanForward64(&count, v); + return (int)count; +#else + return (uint32_t)v ? countTrailingZeroes((uint32_t)v) + : 32 + countTrailingZeroes((uint32_t)(v >> 32)); +#endif + } + + inline int countLeadingZeroes(uint32_t v) + { + if (v == 0) + { + return 32; + } +#if defined(__GNUC__) + return __builtin_clz(v); +#elif defined(_MSC_VER) + unsigned long count; + _BitScanReverse(&count, v); + // BitScanReverse gives the bit position (0 for the LSB, then 1, etc.) of the + // first bit that is 1, when looking from the MSB. To count leading zeros, we + // need to adjust that. + return 31 - int(count); +#else + // See Stanford bithacks, find the log base 2 of an N-bit integer in + // O(lg(N)) operations with multiply and lookup: + // http://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn + static const uint8_t tbl[32] = { 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, + 13, 9, 6, 28, 1, 23, 19, 11, 3, 16, 14, + 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 }; + v = v | (v >> 1); + v = v | (v >> 2); + v = v | (v >> 4); + v = v | (v >> 8); + v = v | (v >> 16); + return (int)tbl[((uint32_t)(v * 0x07C4ACDDU)) >> 27]; +#endif + } + + inline int countLeadingZeroes(uint64_t v) + { + if (v == 0) + { + return 64; + } +#if defined(__GNUC__) + return __builtin_clzll(v); +#elif defined(_MSC_VER) && defined(_M_X64) + unsigned long count; + _BitScanReverse64(&count, v); + return 63 - int(count); +#else + return v >> 32 ? 
countLeadingZeroes((uint32_t)(v >> 32)) + : 32 + countLeadingZeroes((uint32_t)v); +#endif + } + + inline int ceilLog2(uint32_t v) { return 32 - countLeadingZeroes(v - 1); } /* bit width of v - 1, i.e. ceil(log2(v)) for v >= 1 */ + + inline int ceilLog2(uint64_t v) { return 64 - countLeadingZeroes(v - 1); } /* 64-bit counterpart of the overload above */ + +#ifdef __APPLE__ + inline int ceilLog2(size_t v) { return ceilLog2((uint64_t)v); } +#endif + } +} \ No newline at end of file diff --git a/include/kiwi/FixedVector.hpp b/include/kiwi/FixedVector.hpp new file mode 100644 index 0000000..6102180 --- /dev/null +++ b/include/kiwi/FixedVector.hpp @@ -0,0 +1,77 @@ +#pragma once +#include + +namespace kiwi +{ + template + class FixedVector /* fixed-length array kept in one malloc'ed block: a size_t element count followed by the elements */ + { + void* _data = nullptr; + public: + FixedVector(size_t s = 0) /* allocates room for s elements and default-constructs each; s == 0 allocates nothing */ + { + if (s) + { + _data = std::malloc(sizeof(Ty) * s + sizeof(size_t)); + *(size_t*)_data = s; + for (size_t i = 0; i < s; ++i) + { + new (&operator[](i)) Ty; + } + } + else _data = nullptr; + } + + FixedVector(const FixedVector& o) /* element-wise copy; an empty source leaves _data null */ + { + if (!o.empty()) + { + _data = std::malloc(sizeof(Ty) * o.size() + sizeof(size_t)); + *(size_t*)_data = o.size(); + for (size_t i = 0; i < o.size(); ++i) + { + new (&operator[](i)) Ty{ o[i] }; + } + } + } + + FixedVector(FixedVector&& o) noexcept /* takes o's block; o is left holding null */ + { + std::swap(_data, o._data); + } + + ~FixedVector() /* destroys every element, then frees the block */ + { + if (!_data) return; + for (auto& p : *this) p.~Ty(); + std::free(_data); + } + + FixedVector& operator=(const FixedVector& o) /* copy-and-swap: safe for self-assignment, and *this is untouched if the copy throws */ + { + FixedVector tmp(o); + std::swap(_data, tmp._data); /* tmp's destructor releases the previous contents */ + return *this; + } + + FixedVector& operator=(FixedVector&& o) noexcept /* exchanges blocks; o ends up owning the previous contents of *this */ + { + std::swap(_data, o._data); + return *this; + } + + size_t size() const { return _data ? *(const size_t*)_data : 0; } /* element count read from the block header */ + bool empty() const { return !size(); } + + Ty* data() { return _data ? (Ty*)((size_t*)_data + 1) : nullptr; } /* elements start right after the size_t header */ + const Ty* data() const { return _data ? 
(const Ty*)((const size_t*)_data + 1) : nullptr; } + + Ty* begin() { return data(); } + Ty* end() { return data() + size(); } + const Ty* begin() const { return data(); } + const Ty* end() const { return data() + size(); } + + Ty& operator[](size_t i) { return data()[i]; } + const Ty& operator[](size_t i) const { return data()[i]; } + }; +} \ No newline at end of file diff --git a/include/kiwi/Form.h b/include/kiwi/Form.h new file mode 100644 index 0000000..2e3150a --- /dev/null +++ b/include/kiwi/Form.h @@ -0,0 +1,189 @@ +/** + * @file Form.h + * @author bab2min (bab2min@gmail.com) + * @brief 형태 및 형태소에 관한 정보를 담는 구조체들이 선언된 헤더 + * @version 0.10.0 + * @date 2021-09-10 + * + * + */ + +#pragma once + +#include +#include + +namespace kiwi +{ + struct Morpheme; + + /** + * @brief 형태소에 관한 모든 정보를 담는 구조체의 템플릿 + * + * @note 변경가능한 상태로 인덱스와 관련된 값이나 std::vector 등의 길이를 변경할 수 있음. + * `kiwi::KiwiBuilder`에서 사용한다. + * `baked = true`는 변경 불가능한 상태로 인덱스는 모두 포인터로, std::vector는 FixedVector로 변경되어 수정이 불가능한 대신 + * 각 값에 효율적으로 빠르게 접근 가능하다. 이 상태는 `kiwi::Morpheme`이라는 타입의 부모클래스로 쓰이며, + * `kiwi::Kiwi` 내 실제 형태소 분석 단계에 쓰인다. + */ + struct MorphemeRaw + { + uint32_t kform = 0; /**< 형태에 대한 포인터 */ + POSTag tag = POSTag::unknown; /**< 품사 태그 */ + CondVowel vowel = CondVowel::none; /**< 선행형태소의 자/모음 조건 */ + CondPolarity polar = CondPolarity::none; /**< 선행형태소의 모음조화 조건 */ + + /** + * @brief 형태소가 두 부분으로 분할된 경우 결합 번호를 표기하기 위해 사용된다. + * + * @note `덥/VA`, `춥/VA` 등의 형태소는 `어/EC`와 만나면 `더워`, `추워`와 같이 형태가 변화한다. + * 이 경우를 각각 처리하기 보다는 `더/V + ㅂ/V`, `추/V + ㅂ/V`과 같이 분해하면 + * `ㅂ/V` + `어/EC`가 `워`로 변한다는 규칙만으로 처리가 가능해진다. (이 규칙은 `chunks`를 이용해 형태소 정보에 담길 수 있음) + * 그러나 모든 ㅂ으로 끝나는 형태소가 위와 같은 규칙에 결합되면 안된다. + * 예를 들어 `굽/VA`의 경우 `어/EC`와 만나도 `굽어`라고 형태가 유지되기 때문. + * 따라서 `ㅂ/V`이 결합할 수 있는 조건을 명시해서 이 조건과 맞는 경우에만 `더/V + ㅂ/V` -> `덥/VA`과 같이 복원해야 한다. + * `combineSocket`이 0이면 이런 결합 조건이 없는 일반 형태소임을 뜻하며, 0이 아닌 경우 결합 조건을 가지고 분해된 형태소임을 뜻한다. + * `더/V`와 `워/UNK`(`ㅂ/V + 어/EC`)는 예를 들어 3과 같이 동일한 combineSocket을 할당해 둘이 서로 결합이 가능한 형태소임을 식별한다. 
+ */ + uint8_t combineSocket = 0; + + /** + * @brief 여러 형태소가 결합되어 형태가 변경된 경우에 원 형태소 목록을 표기하기 위해 사용된다. + * + * @note `되/VV + 어/EC`의 결합은 `돼`라는 형태로 축약될 수 있다. + * 분석과정에서 `돼`를 만난 경우 역으로 `되/VV + 어/EC`로 분석할 수 있도록 `돼/UNK`를 더미 형태소로 등록하고 + * chunks에는 `되/VV`와 `어/EC`에 대한 포인터를 넣어둔다. + */ + Vector chunks; + + /** + * @brief 분할된 형태소의 원형 형태소를 가리키는 오프셋 + * + * @note `덥/VA`이 `더/V` + `ㅂ/V`으로 분할된 경우 `더/V`는 `덥/VA`에 대한 오프셋을 combined에 저장해둔다. + * `kiwi::Morpheme::getCombined()`를 통해 원형 형태소의 포인터를 구할 수 있음 + * @sa combineSocket + */ + int32_t combined = 0; + float userScore = 0; + + MorphemeRaw(); + ~MorphemeRaw(); + MorphemeRaw(const MorphemeRaw&); + MorphemeRaw(MorphemeRaw&&); + MorphemeRaw& operator=(const MorphemeRaw&); + MorphemeRaw& operator=(MorphemeRaw&&); + + MorphemeRaw( + POSTag _tag, + CondVowel _vowel = CondVowel::none, + CondPolarity _polar = CondPolarity::none, + uint8_t _combineSocket = 0 + ); + + void serializerRead(std::istream& istr); + void serializerWrite(std::ostream& ostr) const; + }; + + /** + * @brief 형태소에 관한 모든 정보를 담는 구조체의 템플릿 + * + * @note 변경 불가능한 상태로 인덱스는 모두 포인터로, std::vector는 FixedVector로 변경되어 수정이 불가능한 대신 + * 각 값에 효율적으로 빠르게 접근 가능하다. `kiwi::Kiwi` 내 실제 형태소 분석 단계에 쓰인다. + */ + struct Morpheme + { + const KString* kform = nullptr; + POSTag tag = POSTag::unknown; + CondVowel vowel = CondVowel::none; + CondPolarity polar = CondPolarity::none; + uint8_t combineSocket = 0; + FixedVector chunks; + int32_t combined = 0; + float userScore = 0; + + Morpheme(); + ~Morpheme(); + Morpheme(const Morpheme&); + Morpheme(Morpheme&&); + Morpheme& operator=(const Morpheme&); + Morpheme& operator=(Morpheme&&); + + std::ostream& print(std::ostream& os) const; + + /** 형태소의 형태를 반환한다. */ + const KString& getForm() const { return *kform; } + + /** 분할된 형태소의 경우 원형 형태소를 반환한다. 그 외에는 자기 자신을 반환한다. */ + const Morpheme* getCombined() const { return this + combined; } + }; + + /** + * @brief 형태에 관한 모든 정보를 담는 구조체의 템플릿 + * + * @note 변경가능한 상태로 인덱스와 관련된 값이나 std::vector 등의 길이를 변경할 수 있음. 
`kiwi::KiwiBuilder`에서 사용한다. + * `baked = true`는 변경 불가능한 상태로 인덱스는 모두 포인터로, std::vector는 FixedVector로 변경되어 수정이 불가능한 대신 + * 각 값에 효율적으로 빠르게 접근 가능하다. 이 상태는 `kiwi::Form`이라는 타입의 부모클래스로 쓰이며, + * `kiwi::Kiwi` 내 실제 형태소 분석 단계에 쓰인다. + */ + struct FormRaw + { + KString form; /**< 형태 */ + CondVowel vowel = CondVowel::none; /**< 선행형태소의 자/모음 조건 */ + CondPolarity polar = CondPolarity::none; /**< 선행형태소의 모음조화 조건 */ + Vector candidate; + /**< 이 형태에 해당하는 형태소들의 목록 */ + + FormRaw(); + ~FormRaw(); + FormRaw(const FormRaw&); + FormRaw(FormRaw&&); + FormRaw& operator=(const FormRaw&); + FormRaw& operator=(FormRaw&&); + + FormRaw(const KString& _form, CondVowel _vowel, CondPolarity _polar); + bool operator<(const FormRaw& o) const; + + void serializerRead(std::istream& istr); + void serializerWrite(std::ostream& ostr) const; + }; + + /** + * @brief 형태에 관한 모든 정보를 담는 구조체의 템플릿 + * + * @note 변경 불가능한 상태로 인덱스는 모두 포인터로, std::vector는 FixedVector로 변경되어 수정이 불가능한 대신 + * 각 값에 효율적으로 빠르게 접근 가능하다. `kiwi::Kiwi` 내 실제 형태소 분석 단계에 쓰인다. + */ + struct Form + { + KString form; + CondVowel vowel = CondVowel::none; + CondPolarity polar = CondPolarity::none; + FixedVector candidate; + + Form(); + ~Form(); + Form(const Form&); + Form(Form&&); + Form& operator=(const Form&); + Form& operator=(Form&&); + }; + + /** + * @brief 변경가능한 형태 정보를 bake하여 최적화한다. + * + * @param o 변경 가능한 형태 정보 + * @param morphBase 형태소 배열의 시작 위치 + * @return 최적화된 형태 정보 + */ + Form bake(const FormRaw& o, const Morpheme* morphBase); + + /** + * @brief 변경 가능한 형태소 정보를 bake하여 최적화한다. 
+ * + * @param o 변경 가능한 형태소 정보 + * @param morphBase 형태소 배열의 시작 위치 + * @param formBase 형태 배열의 시작 위치 + * @return 최적화된 형태소 정보 + */ + Morpheme bake(const MorphemeRaw& o, const Morpheme* morphBase, const Form* formBase); +} diff --git a/include/kiwi/FrozenTrie.h b/include/kiwi/FrozenTrie.h new file mode 100644 index 0000000..0752b19 --- /dev/null +++ b/include/kiwi/FrozenTrie.h @@ -0,0 +1,80 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace kiwi +{ + namespace utils + { + namespace detail + { + template + struct HasSubmatch {}; + + template + struct HasSubmatch::value>::type> + { + static constexpr Value hasSubmatch = (Value)-1; + }; + + template + struct HasSubmatch::value>::type> + { + static constexpr ptrdiff_t hasSubmatch = -1; + }; + } + + template + class FrozenTrie : public detail::HasSubmatch<_Value> + { + public: + using Key = _Key; + using Value = _Value; + using Diff = _Diff; + + struct Node + { + Key numNexts = 0; + Diff lower = 0; + uint32_t nextOffset = 0; + + const Node* next(const FrozenTrie& ft, Key c) const; + const Node* fail() const; + const Node* findFail(const FrozenTrie& ft, Key c) const; + const Value& val(const FrozenTrie& ft) const; + }; + private: + size_t numNodes = 0; + size_t numNexts = 0; + std::unique_ptr nodes; + std::unique_ptr values; + std::unique_ptr nextKeys; + std::unique_ptr nextDiffs; + + public: + + FrozenTrie() = default; + + template + FrozenTrie(const ContinuousTrie& trie); + + FrozenTrie(const FrozenTrie& o); + FrozenTrie(FrozenTrie&&) = default; + + FrozenTrie& operator=(const FrozenTrie& o); + FrozenTrie& operator=(FrozenTrie&& o) = default; + + bool empty() const { return !numNodes; } + size_t size() const { return numNodes; } + const Node* root() const { return nodes.get(); } + + const Value& value(size_t idx) const { return values[idx]; }; + }; + } +} diff --git a/include/kiwi/Kiwi.h b/include/kiwi/Kiwi.h new file mode 100644 index 0000000..6894226 --- /dev/null +++ 
b/include/kiwi/Kiwi.h @@ -0,0 +1,363 @@ +/** + * @file Kiwi.h + * @author bab2min (bab2min@gmail.com) + * @brief Kiwi C++ API를 담고 있는 헤더 파일 + * @version 0.10.0 + * @date 2021-08-31 + * + * + */ +#pragma once + +#include +#include +#include +#include "Macro.h" +#include "Types.h" +#include "Form.h" +#include "Utils.h" +#include "Trainer.h" +#include "Trie.hpp" +#include "PatternMatcher.h" +#include "FrozenTrie.h" +#include "Knlm.h" +#include "ThreadPool.h" +#include "WordDetector.h" + +namespace kiwi +{ + struct KTrie; + struct KGraphNode; + struct WordInfo; + + /** + * @brief 실제 형태소 분석을 수행하는 클래스. + * + */ + class Kiwi + { + friend class KiwiBuilder; + friend class PathEvaluator; + + bool integrateAllomorph = true; + float cutOffThreshold = 5; + + std::vector forms; + std::vector morphemes; + utils::FrozenTrie formTrie; + std::shared_ptr langMdl; + std::unique_ptr pool; + + std::vector analyzeSent(const std::u16string::const_iterator& sBegin, const std::u16string::const_iterator& sEnd, size_t topN, Match matchOptions) const; + + const Morpheme* getDefaultMorpheme(POSTag tag) const; + + public: + /** + * @brief 빈 Kiwi 객체를 생성한다. + * + * @note 이 생성자는 기본 생성자로 이를 통해 생성된 객체는 바로 형태소 분석에 사용할 수 없다. + * kiwi::KiwiBuilder 를 통해 생성된 객체만이 형태소 분석에 사용할 수 있다. + */ + Kiwi(); + + ~Kiwi(); + + Kiwi(const Kiwi&) = delete; + + Kiwi(Kiwi&&); + + Kiwi& operator=(const Kiwi&) = delete; + + Kiwi& operator=(Kiwi&&); + + /** + * @brief 현재 Kiwi 객체가 형태소 분석을 수행할 준비가 되었는지를 알려준다. + * + * @return 형태소 분석 준비가 완료된 경우 true를 반환한다. + * + * @note 기본 생성자를 통해 생성된 경우 언제나 `ready() == false`이며, + * `kiwi::KiwiBuilder`를 통해 생성된 경우 `ready() == true`이다. 
+ */ + bool ready() const { return !forms.empty(); } + + /** + * @brief + * + * @param str + * @param matchOptions + * @return TokenResult + */ + TokenResult analyze(const std::u16string& str, Match matchOptions) const + { + return analyze(str, 1, matchOptions)[0]; + } + + /** + * @brief + * + * @param str + * @param matchOptions + * @return TokenResult + */ + TokenResult analyze(const std::string& str, Match matchOptions) const + { + return analyze(utf8To16(str), matchOptions); + } + + /** + * @brief + * + * @param str + * @param topN + * @param matchOptions + * @return std::vector + */ + std::vector analyze(const std::u16string& str, size_t topN, Match matchOptions) const; + + /** + * @brief + * + * @param str + * @param topN + * @param matchOptions + * @return std::vector + */ + std::vector analyze(const std::string& str, size_t topN, Match matchOptions) const + { + return analyze(utf8To16(str), topN, matchOptions); + } + + /** + * @brief + * + * @param str + * @param topN + * @param matchOptions + * @return std::future> + */ + std::future> asyncAnalyze(const std::string& str, size_t topN, Match matchOptions) const; + + /** + * @brief + * + * @tparam ReaderCallback + * @tparam ResultCallback + * @param topN + * @param reader + * @param resultCallback + * @param matchOptions + */ + template + void analyze(size_t topN, ReaderCallback&& reader, ResultCallback&& resultCallback, Match matchOptions) const + { + if (pool) + { + bool stop = false; + std::deque>> futures; + for (size_t i = 0; i < pool->size() * 2; ++i) + { + auto ustr = reader(); + if (ustr.empty()) + { + stop = true; + break; + } + futures.emplace_back(pool->enqueue([&, ustr](size_t tid) + { + return analyze(ustr, topN, matchOptions); + })); + } + + while (!futures.empty()) + { + resultCallback(futures.front().get()); + futures.pop_front(); + if (!stop) + { + auto ustr = reader(); + if (ustr.empty()) + { + stop = true; + continue; + } + futures.emplace_back(pool->enqueue([&, ustr](size_t tid) + { + 
return analyze(ustr, topN, matchOptions); + })); + } + } + } + else + { + while(1) + { + auto ustr = reader(); + if (ustr.empty()) break; + resultCallback(analyze(ustr, topN, matchOptions)); + } + } + } + + size_t morphToId(const Morpheme* morph) const + { + if (!morph || morph < morphemes.data()) return -1; + return morph - morphemes.data(); + } + + const Morpheme* idToMorph(size_t morphId) const + { + if (morphId >= morphemes.size()) return nullptr; + return &morphemes[morphId]; + } + + size_t getNumThreads() const + { + return pool ? pool->size() : 1; // without a pool, analysis runs synchronously on the calling thread, so report 1 + } + + float getCutOffThreshold() const + { + return cutOffThreshold; + } + + void setCutOffThreshold(float v) + { + cutOffThreshold = v; + } + + bool getIntegrateAllomorph() const + { + return integrateAllomorph; + } + + void setIntegrateAllomorph(bool v) + { + integrateAllomorph = v; + } + + const lm::KnLangModelBase* getLangModel() const + { + return langMdl.get(); + } + }; + + /** + * @brief Manages the dictionary used for morphological analysis, + * and creates Kiwi instances that perform the actual analysis based on that dictionary.
+ * + */ + class KiwiBuilder + { + std::vector forms; + std::vector morphemes; + std::unordered_map formMap; + std::shared_ptr langMdl; + size_t numThreads = 0; + WordDetector detector; + BuildOption options = BuildOption::none; + + void loadMorphBin(std::istream& is); + void saveMorphBin(std::ostream& os) const; + FormRaw& addForm(KString form, CondVowel vowel, CondPolarity polar); + + using MorphemeMap = std::unordered_map, size_t>; + void loadMMFromTxt(std::istream&& is, MorphemeMap& morphMap, std::unordered_map* posWeightSum, const std::function& selector); + void loadCMFromTxt(std::istream&& is, MorphemeMap& morphMap); + void loadPCMFromTxt(std::istream&& is, MorphemeMap& morphMap); + void addCorpusTo(Vector>& out, std::istream&& is, MorphemeMap& morphMap); + void updateForms(); + public: + struct FromRawData {}; + static constexpr FromRawData fromRawDataTag = {}; + + /** + * @brief KiwiBuilder의 기본 생성자 + * + * @note 이 생성자로 생성된 경우 `ready() == false`인 상태이므로 유효한 Kiwi 객체를 생성할 수 없다. + */ + KiwiBuilder(); + + ~KiwiBuilder(); + + KiwiBuilder(const KiwiBuilder&); + + KiwiBuilder(KiwiBuilder&&); + + KiwiBuilder& operator=(const KiwiBuilder&); + + KiwiBuilder& operator=(KiwiBuilder&&); + + /** + * @brief KiwiBuilder를 raw 데이터로부터 생성한다. + * + * @param rawDataPath + * @param numThreads + * @param options + * + * @note 이 함수는 현재 내부적으로 모델 구축에 쓰인다. + * 추후 공개 데이터로도 쉽게 직접 모델을 구축할 수 있도록 개선된 API를 제공할 예정. + */ + KiwiBuilder(FromRawData, const std::string& rawDataPath, size_t numThreads = 0, BuildOption options = BuildOption::integrateAllomorph | BuildOption::loadDefaultDict); + + /** + * @brief KiwiBuilder를 모델 파일로부터 생성한다. + * + * @param modelPath 모델이 위치한 경로 + * @param numThreads 모델 및 형태소 분석에 사용할 스레드 개수 + * @param options 생성 옵션. `kiwi::BuildOption`을 참조 + */ + KiwiBuilder(const std::string& modelPath, size_t numThreads = 0, BuildOption options = BuildOption::integrateAllomorph | BuildOption::loadDefaultDict); + + /** + * @brief 현재 KiwiBuilder 객체가 유효한 분석 모델을 로딩한 상태인지 알려준다. 
+ * + * @return 유효한 상태면 true를 반환한다. 기본 생성자로 생성한 경우 `ready() == false`이며, + * 다른 생성자로 생성한 경우는 `ready() == true`이다. + */ + bool ready() const + { + return !!langMdl; + } + + void saveModel(const std::string& modelPath) const; + + /** + * @brief + * + * @param str + * @param tag + * @param score + * @return + */ + bool addWord(const std::u16string& str, POSTag tag = POSTag::nnp, float score = 0); + + /** + * @brief + * + * @param dictPath + * @return + */ + size_t loadDictionary(const std::string& dictPath); + + std::vector extractWords(const U16MultipleReader& reader, + size_t minCnt = 10, size_t maxWordLen = 10, float minScore = 0.25, float posThreshold = -3, bool lmFilter = true + ) const; + + std::vector extractAddWords(const U16MultipleReader& reader, + size_t minCnt = 10, size_t maxWordLen = 10, float minScore = 0.25, float posThreshold = -3, bool lmFilter = true + ); + + /** + * @brief 현재 단어 및 사전 설정을 기반으로 Kiwi 객체를 생성한다. + * + * @return 형태소 분석 준비가 완료된 Kiwi의 객체. + */ + Kiwi build() const; + + const lm::KnLangModelBase* getLangModel() const + { + return langMdl.get(); + } + }; +} diff --git a/include/kiwi/Knlm.h b/include/kiwi/Knlm.h new file mode 100644 index 0000000..084d67c --- /dev/null +++ b/include/kiwi/Knlm.h @@ -0,0 +1,156 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "Mmap.h" + +namespace kiwi +{ + namespace lm + { + using Vid = uint16_t; + + struct Header + { + uint64_t num_nodes, node_offset, key_offset, ll_offset, gamma_offset, qtable_offset, htx_offset; + uint64_t unk_id, bos_id, eos_id, vocab_size; + uint8_t order, key_size, diff_size, quantized; + }; + + template + struct Node + { + KeyType num_nexts = 0; + DiffType lower = 0; + uint32_t next_offset = 0; + }; + + class KnLangModelBase + { + protected: + utils::MemoryObject base; + + KnLangModelBase(utils::MemoryObject&& mem) : base{ std::move(mem) } + { + } + + virtual float getLL(ptrdiff_t node_idx, size_t next) const = 0; + virtual std::vector 
allNextLL(ptrdiff_t node_idx) const = 0; + virtual std::vector allNextLL(ptrdiff_t node_idx, std::vector& next_node_idx) const = 0; + + public: + + virtual ~KnLangModelBase() {} + const Header& getHeader() const { return *reinterpret_cast(base.get()); } + + virtual size_t llSize() const = 0; + virtual const float* getLLBuf() const = 0; + virtual const float* getGammaBuf() const = 0; + + static std::unique_ptr create(utils::MemoryObject&& mem); + + template + static utils::MemoryOwner build(const utils::ContinuousTrie& ngram_cf, + size_t order, size_t min_cf, size_t last_min_cf, + size_t unk_id, size_t bos_id, size_t eos_id, + float unigram_alpha, size_t quantize, bool compress, + const std::vector>* bigram_list = nullptr, + const std::vector* historyTransformer = nullptr + ); + + const utils::MemoryObject& getMemory() const { return base; } + + virtual float progress(ptrdiff_t& node_idx, size_t next) const = 0; + + template + void evaluate(InTy in_first, InTy in_last, OutTy out_first) const + { + ptrdiff_t node_idx = 0; + for (; in_first != in_last; ++in_first) + { + *out_first = progress(node_idx, *in_first); + ++out_first; + } + } + + template + float sum(InTy in_first, InTy in_last, float min_score = -100) const + { + float ret = 0; + ptrdiff_t node_idx = 0; + for (; in_first != in_last; ++in_first) + { + ret += std::max(progress(node_idx, *in_first), min_score); + } + return ret; + } + + template + std::vector getNextLL(InTy in_first, InTy in_last) const + { + ptrdiff_t node_idx = 0; + for (; in_first != in_last; ++in_first) + { + progress(node_idx, *in_first); + } + return allNextLL(node_idx); + } + + template + void predict(InTy in_first, InTy in_last, OutTy out_first) const + { + ptrdiff_t node_idx = 0; + for (; in_first != in_last; ++in_first) + { + progress(node_idx, *in_first); + *out_first = allNextLL(node_idx); + ++out_first; + } + } + + template + void fillIn(PfTy prefix_first, PfTy prefix_last, SfTy suffix_first, SfTy suffix_last, OutTy out_first, 
bool reduce = true) const + { + ptrdiff_t node_idx = 0; + for (; prefix_first != prefix_last; ++prefix_first) + { + progress(node_idx, *prefix_first); + } + + std::vector next_node_idcs; + *out_first = allNextLL(node_idx, next_node_idcs); + + if (reduce) + { + for (size_t i = 0; i < next_node_idcs.size(); ++i) + { + auto node_idx = next_node_idcs[i]; + for (auto it = suffix_first; it != suffix_last; ++it) + { + (*out_first)[i] += progress(node_idx, *it); + } + } + } + else + { + ++out_first; + for (size_t i = 0; i < next_node_idcs.size(); ++i) + { + auto node_idx = next_node_idcs[i]; + auto out_next = out_first; + for (auto it = suffix_first; it != suffix_last; ++it) + { + (*out_next)[i] = progress(node_idx, *it); + ++out_next; + } + } + } + } + }; + } +} diff --git a/include/kiwi/Macro.h b/include/kiwi/Macro.h new file mode 100644 index 0000000..0c0acea --- /dev/null +++ b/include/kiwi/Macro.h @@ -0,0 +1,10 @@ +#pragma once + +#define KIWI_STR_HELPER(x) #x +#define KIWI_STR(x) KIWI_STR_HELPER(x) + +#define KIWI_VERSION_MAJOR 0 +#define KIWI_VERSION_MINOR 10 +#define KIWI_VERSION_PATCH 2 + +#define KIWI_VERSION_STRING KIWI_STR(KIWI_VERSION_MAJOR) "." KIWI_STR(KIWI_VERSION_MINOR) "." 
KIWI_STR(KIWI_VERSION_PATCH) diff --git a/include/kiwi/Mmap.h b/include/kiwi/Mmap.h new file mode 100644 index 0000000..93503c5 --- /dev/null +++ b/include/kiwi/Mmap.h @@ -0,0 +1,345 @@ +#pragma once +#include +#include + +#ifdef _WIN32 +#define NOMINMAX +#include +namespace kiwi +{ + namespace utils + { + namespace detail + { + class HandleGuard + { + HANDLE handle = nullptr; + public: + HandleGuard(HANDLE _handle = nullptr) : handle(_handle) + { + } + + HandleGuard(const HandleGuard&) = delete; + HandleGuard& operator =(const HandleGuard&) = delete; + + HandleGuard(HandleGuard&& o) noexcept + { + std::swap(handle, o.handle); + } + + HandleGuard& operator=(HandleGuard&& o) noexcept + { + std::swap(handle, o.handle); + return *this; + } + + ~HandleGuard() + { + if (handle && handle != INVALID_HANDLE_VALUE) + { + CloseHandle(handle); + handle = nullptr; + } + } + + operator HANDLE() const + { + return handle; + } + }; + } + + class MMap + { + const char* view = nullptr; + size_t len = 0; + detail::HandleGuard hFile, hFileMap; + public: + MMap(const std::string& filepath) + { + hFile = CreateFileA(filepath.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, nullptr); + if (hFile == INVALID_HANDLE_VALUE) throw std::ios_base::failure("Cannot open '" + filepath + "'"); + hFileMap = CreateFileMapping(hFile, nullptr, PAGE_READONLY, 0, 0, nullptr); + if (hFileMap == nullptr) throw std::ios_base::failure("Cannot open '" + filepath + "' Code:" + std::to_string(GetLastError())); + view = (const char*)MapViewOfFile(hFileMap, FILE_MAP_READ, 0, 0, 0); + DWORD high; + len = GetFileSize(hFile, &high); + len |= (size_t)high << 32; + } + + MMap(const MMap&) = delete; + MMap& operator=(const MMap&) = delete; + + MMap(MMap&&) = default; + MMap& operator=(MMap&&) = default; + + ~MMap() + { + if (hFileMap) + { + UnmapViewOfFile(view); + hFileMap.~HandleGuard(); + } + } + + const char* get() const { return view; } + size_t size() const { return len; } + 
}; + } +} +#else +#include +#include +#include +#include +#include + +namespace kiwi +{ + namespace utils + { + namespace detail + { + class FDGuard + { + int fd = 0; + public: + FDGuard(int _fd = 0) : fd(_fd) + { + } + + FDGuard(const FDGuard&) = delete; + FDGuard& operator =(const FDGuard&) = delete; + + FDGuard(FDGuard&& o) + { + std::swap(fd, o.fd); + } + + FDGuard& operator=(FDGuard&& o) + { + std::swap(fd, o.fd); + return *this; + } + + ~FDGuard() + { + if (fd && fd != -1) + { + close(fd); + fd = 0; + } + } + + operator int() const + { + return fd; + } + }; + } + + class MMap + { + const char* view = nullptr; + size_t len = 0; + detail::FDGuard fd; + public: + MMap(const std::string& filepath) + { + fd = open(filepath.c_str(), O_RDONLY); + if (fd == -1) throw std::ios_base::failure("Cannot open '" + filepath + "'"); + struct stat sb; + if (fstat(fd, &sb) < 0) throw std::ios_base::failure("Cannot open '" + filepath + "'"); + len = sb.st_size; + view = (const char*)mmap(nullptr, len, PROT_READ, MAP_PRIVATE, fd, 0); + if (view == MAP_FAILED) throw std::ios_base::failure("Mapping failed"); + } + + MMap(const MMap&) = delete; + MMap& operator=(const MMap&) = delete; + + MMap(MMap&& o) + { + std::swap(view, o.view); std::swap(len, o.len); std::swap(fd, o.fd); // move the whole mapping: pointer, length and descriptor travel together + } + + MMap& operator=(MMap&& o) + { + std::swap(view, o.view); std::swap(len, o.len); std::swap(fd, o.fd); // swap all three so `o` releases our previous mapping with its matching length + return *this; + } + + ~MMap() + { + if (view) + { + munmap((void*)view, len); + } + } + + const char* get() const { return view; } + size_t size() const { return len; } + }; + } +} +#endif + +#include +#include + +namespace kiwi +{ + namespace utils + { + class MemoryOwner + { + std::unique_ptr _ptr; + size_t _size = 0; + + public: + MemoryOwner() = default; + MemoryOwner(size_t tot_size) + : _ptr{ new char[tot_size] }, _size{ tot_size } + { + } + + void* get() const { return _ptr.get(); } + size_t size() const { return _size; } + }; + + class MemoryObject + { + struct Concept + { + virtual ~Concept() {}; + virtual const void* get() const = 0; + virtual size_t size() const = 0; + }; + +
template + struct Model : Concept + { + private: + Ty obj; + public: + Model(const Ty& t) : obj{ t } {} + Model(Ty&& t) : obj{ std::move(t) } {} + + virtual const void* get() const { return obj.get(); } + virtual size_t size() const { return obj.size(); } + }; + + std::shared_ptr obj; + + public: + template + MemoryObject(const Ty& _obj) : obj{ std::make_shared>(std::move(_obj)) } {} + + template + MemoryObject(Ty&& _obj) : obj{ std::make_shared::type>>(std::forward(_obj)) } {} + + MemoryObject(const MemoryObject&) = default; + MemoryObject(MemoryObject&&) = default; + + const void* get() const { return obj->get(); } + size_t size() const { return obj->size(); } + }; + + template + struct membuf : public std::streambuf + { + membuf(char* base, std::ptrdiff_t n) + { + if (read) + { + this->setg(base, base, base + n); + } + + if (write) + { + this->setp(base, base + n); + } + } + + pos_type seekpos(pos_type sp, std::ios_base::openmode which) override { + return seekoff(sp - pos_type(off_type(0)), std::ios_base::beg, which); + } + + pos_type seekoff(off_type off, + std::ios_base::seekdir dir, + std::ios_base::openmode which = std::ios_base::in + ) override { + if (which & std::ios_base::in) + { + if (dir == std::ios_base::cur) + gbump(off); + else if (dir == std::ios_base::end) + setg(eback(), egptr() + off, egptr()); + else if (dir == std::ios_base::beg) + setg(eback(), eback() + off, egptr()); + } + if (which & std::ios_base::out) + { + if (dir == std::ios_base::cur) + pbump(off); + else if (dir == std::ios_base::end) + setp(epptr() + off, epptr()); + else if (dir == std::ios_base::beg) + setp(pbase() + off, epptr()); + } + return gptr() - eback(); + } + + const char* curptr() const + { + return this->gptr(); + } + }; + + class imstream : public std::istream + { + membuf buf; + public: + imstream(const char* base, std::ptrdiff_t n) + : std::istream(&buf), buf((char*)base, n) + { + } + + template + imstream(const Ty& m) : imstream(m.get(), m.size()) + { + } + + const 
char* curptr() const + { + return buf.curptr(); + } + }; + + class omstream : public std::ostream + { + membuf buf; + public: + omstream(char* base, std::ptrdiff_t n) + : std::ostream(&buf), buf((char*)base, n) + { + } + + template + omstream(const Ty& m) : omstream(m.get(), m.size()) + { + } + }; + + template + Ty read(std::istream& istr) + { + Ty ret; + if (!istr.read((char*)&ret, sizeof(Ty))) + { + throw std::ios_base::failure(std::string{ "reading type '" } + typeid(Ty).name() + "' failed"); + } + return ret; + } + } +} diff --git a/include/kiwi/PatternMatcher.h b/include/kiwi/PatternMatcher.h new file mode 100644 index 0000000..a694537 --- /dev/null +++ b/include/kiwi/PatternMatcher.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include +#include +#include "Types.h" + +namespace kiwi +{ + enum class Match : size_t + { + none = 0, + url = 1 << 0, + email = 1 << 1, + hashtag = 1 << 2, + mention = 1 << 3, + normalizeCoda = 1 << 16, + all = url | email | hashtag | mention, + allWithNormalizing = all | normalizeCoda, + }; + + std::pair matchPattern(const char16_t* first, const char16_t* last, Match matchOptions); +} + +KIWI_DEFINE_ENUM_FLAG_OPERATORS(kiwi::Match); diff --git a/include/kiwi/ThreadPool.h b/include/kiwi/ThreadPool.h new file mode 100644 index 0000000..a218813 --- /dev/null +++ b/include/kiwi/ThreadPool.h @@ -0,0 +1,109 @@ +#pragma once + +/* +A simple C++11 Thread Pool implementation(https://github.com/progschj/ThreadPool) +modified by bab2min to have additional parameter threadId +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace kiwi +{ + namespace utils + { + class ThreadPool + { + public: + ThreadPool(size_t threads = 0, size_t maxQueued = 0); + template + auto enqueue(F&& f, Args&&... 
args) + ->std::future::type>; + ~ThreadPool(); + size_t size() const { return workers.size(); } + size_t numEnqueued() const { return tasks.size(); } + void joinAll(); + private: + std::vector workers; + std::queue> tasks; + + std::mutex queue_mutex; + std::condition_variable condition, inputCnd; + size_t maxQueued; + bool stop; + }; + + inline ThreadPool::ThreadPool(size_t threads, size_t _maxQueued) + : stop(false), maxQueued(_maxQueued) + { + for (size_t i = 0; i < threads; ++i) + workers.emplace_back([this, i] + { + for (;;) + { + std::function task; + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait(lock, + [this] { return this->stop || !this->tasks.empty(); }); + if (this->stop && this->tasks.empty()) return; + task = std::move(this->tasks.front()); + this->tasks.pop(); + if (this->maxQueued) this->inputCnd.notify_all(); + } + task(i); + } + }); + } + + template + auto ThreadPool::enqueue(F&& f, Args&&... args) + -> std::future::type> + { + using return_type = typename std::result_of::type; + + auto task = std::make_shared< std::packaged_task >( + std::bind(std::forward(f), std::placeholders::_1, std::forward(args)...)); + + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + + // don't allow enqueueing after stopping the pool + if (stop) throw std::runtime_error("enqueue on stopped ThreadPool"); + if (maxQueued && tasks.size() >= maxQueued) + { + inputCnd.wait(lock, [&]() { return tasks.size() < maxQueued; }); + } + tasks.emplace([task](size_t id) { (*task)(id); }); + } + condition.notify_one(); + return res; + } + + inline void ThreadPool::joinAll() + { + if (stop) return; + + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for (std::thread& worker : workers) + worker.join(); + } + + inline ThreadPool::~ThreadPool() + { + joinAll(); + } + } +} diff --git a/include/kiwi/Trainer.h b/include/kiwi/Trainer.h new file mode 100644 index 0000000..73b4b86 --- /dev/null +++ 
b/include/kiwi/Trainer.h @@ -0,0 +1 @@ +#pragma once diff --git a/include/kiwi/Trie.hpp b/include/kiwi/Trie.hpp new file mode 100644 index 0000000..daf66ae --- /dev/null +++ b/include/kiwi/Trie.hpp @@ -0,0 +1,333 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace kiwi +{ + namespace utils + { + template + class ConstAccess : public _Map + { + public: + auto operator[](typename _Map::key_type key) const -> typename _Map::mapped_type + { + auto it = this->find(key); + if (it == this->end()) return {}; + else return it->second; + } + + auto operator[](typename _Map::key_type key) -> typename _Map::mapped_type& + { + auto it = this->find(key); + if (it == this->end()) return this->emplace(key, typename _Map::mapped_type{}).first->second; + else return it->second; + } + }; + + template + class TrieIterator : public _Map::const_iterator + { + using Base = typename _Map::const_iterator; + using Key = typename _Map::key_type; + const _Node* base = nullptr; + public: + + TrieIterator(const Base& it, const _Node* _base) + : Base(it), base(_base) + { + } + + std::pair operator*() const + { + auto p = Base::operator*(); + return std::make_pair(p.first, base + p.second); + } + }; + + template>, class _Trie = void> + struct TrieNode + { + using Node = typename std::conditional::value, TrieNode, _Trie>::type; + using Key = _Key; + using Value = _Value; + using KeyStore = _KeyStore; + using iterator = TrieIterator<_KeyStore, Node>; + _KeyStore next = {}; + _Value val = {}; + int32_t fail = 0; + uint32_t depth = 0; + + TrieNode() {} + ~TrieNode() {} + + Node* getNext(_Key i) const + { + return next[i] ? (Node*)this + next[i] : nullptr; + } + + Node* getFail() const + { + return fail ? 
(Node*)this + fail : nullptr; + } + + iterator begin() const + { + return { next.begin(), (const Node*)this }; + } + + iterator end() const + { + return { next.end(), (const Node*)this }; + } + + template + Node* build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc) + { + if (first == last) + { + if (!val) val = _val; + return (Node*)this; + } + + auto v = *first; + if (!getNext(v)) + { + next[v] = alloc() - (Node*)this; + getNext(v)->depth = depth + 1; + } + return getNext(v)->build(++first, last, _val, alloc); + } + + template + Node* findNode(_TyIter begin, _TyIter end) + { + if (begin == end) return (Node*)this; + auto n = getNext(*begin); + if (n) return n->findNode(++begin, end); + return nullptr; + } + + template + void traverse(_Func func) + { + if (val) + { + if (func(val)) return; + } + for (auto& p : next) + { + if (getNext(p.first)) + { + getNext(p.first)->traverse(func); + } + } + return; + } + + template + void traverseWithKeys(_Fn&& fn, std::vector<_CKey>& rkeys, size_t maxDepth = -1, bool ignoreNegative = false) const + { + fn((Node*)this, rkeys); + + if (rkeys.size() >= maxDepth) return; + + for (auto& p : next) + { + if (ignoreNegative ? 
(p.second > 0) : (p.second)) + { + rkeys.emplace_back(p.first); + getNext(p.first)->traverseWithKeys(fn, rkeys, maxDepth, ignoreNegative); + rkeys.pop_back(); + } + } + } + + template + std::pair findMaximumMatch(_Iterator begin, _Iterator end, size_t idxCnt = 0) const + { + if (begin == end) return std::make_pair((Node*)this, idxCnt); + auto n = getNext(*begin); + if (n) + { + auto v = n->findMaximumMatch(++begin, end, idxCnt + 1); + if (v.first->val) return v; + } + return std::make_pair((Node*)this, idxCnt); + } + + Node* findFail(_Key i) const + { + if (!fail) // if this is Root + { + return (Node*)this; + } + else + { + if (getFail()->getNext(i)) // if 'i' node exists + { + return getFail()->getNext(i); + } + else // or loop for failure of this + { + return getFail()->findFail(i); + } + } + } + + void fillFail(bool ignoreNegative = false) + { + std::deque dq; + for (dq.emplace_back((Node*)this); !dq.empty(); dq.pop_front()) + { + auto p = dq.front(); + for (auto&& kv : p->next) + { + auto i = kv.first; + if (ignoreNegative && kv.second < 0) continue; + if (!p->getNext(i)) continue; + p->getNext(i)->fail = p->findFail(i) - p->getNext(i); + dq.emplace_back(p->getNext(i)); + + if (!p->val) + { + for (auto n = p; n->fail; n = n->getFail()) + { + if (!n->val) continue; + p->val = (_Value)-1; + break; + } + } + } + } + } + }; + + template>> + struct TrieNodeEx : public TrieNode<_Key, _Value, _KeyStore, TrieNodeEx<_Key, _Value, _KeyStore>> + { + int32_t parent = 0; + + template + TrieNodeEx* build(_TyIter first, _TyIter last, const _Value& _val, _FnAlloc&& alloc) + { + if (first == last) + { + if (!this->val) this->val = _val; + return this; + } + + auto v = *first; + if (!this->getNext(v)) + { + this->next[v] = alloc() - this; + this->getNext(v)->parent = -this->next[v]; + } + return this->getNext(v)->build(++first, last, _val, alloc); + } + + template + TrieNodeEx* makeNext(const _Key& k, _FnAlloc&& alloc) + { + if (!this->next[k]) + { + this->next[k] = alloc() - 
this; + this->getNext(k)->parent = -this->next[k]; + auto f = this->getFail(); + if (f) + { + f = f->makeNext(k, std::forward<_FnAlloc>(alloc)); + this->getNext(k)->fail = f - this->getNext(k); + } + else + { + this->getNext(k)->fail = this - this->getNext(k); + } + } + return this + this->next[k]; + } + + TrieNodeEx* getParent() const + { + if (!parent) return nullptr; + return (TrieNodeEx*)this + parent; + } + }; + + template + class ContinuousTrie + { + std::vector<_TrieNode> nodes; + + public: + using Node = _TrieNode; + //using Key = typename Node::Key; + //using Value = typename Node::Value; + + ContinuousTrie() = default; + ContinuousTrie(size_t initSize) : nodes(initSize) {} + ContinuousTrie(size_t initSize, size_t initReserve) + { + nodes.reserve(initReserve); + nodes.resize(initSize); + } + + ContinuousTrie(const ContinuousTrie&) = default; + ContinuousTrie(ContinuousTrie&&) = default; + + ContinuousTrie& operator=(const ContinuousTrie&) = default; + ContinuousTrie& operator=(ContinuousTrie&&) = default; + + bool empty() const { return nodes.empty(); } + size_t size() const { return nodes.size(); } + + auto begin() -> decltype(nodes.begin()) { return nodes.begin(); } + auto begin() const -> decltype(nodes.begin()) { return nodes.begin(); } + auto end() -> decltype(nodes.end()) { return nodes.end(); } + auto end() const -> decltype(nodes.end()) { return nodes.end(); } + + void reserveMore(size_t n) + { + if (nodes.capacity() < nodes.size() + n) + { + nodes.reserve(std::max(nodes.size() + n, nodes.capacity() + nodes.capacity() / 2)); + } + } + + Node& operator[](size_t idx) { return nodes[idx]; } + const Node& operator[](size_t idx) const { return nodes[idx]; } + + Node& root() { return nodes[0]; } + const Node& root() const { return nodes[0]; } + + Node* newNode() + { + nodes.emplace_back(); + return &nodes.back(); + } + + template + Node* build(Iter first, Iter last, Value&& val) + { + size_t insertSize = std::distance(first, last); + 
reserveMore(insertSize); + + return nodes[0].build(first, last, val, [&]() { return newNode(); }); + } + + void fillFail(bool ignoreNegative = false) + { + return nodes[0].fillFail(ignoreNegative); + } + + template + void traverseWithKeys(_Fn&& fn, std::vector<_CKey>& rkeys, size_t maxDepth = -1, bool ignoreNegative = false) const + { + return nodes[0].traverseWithKeys(std::forward<_Fn>(fn), rkeys, maxDepth, ignoreNegative); + } + }; + } +} diff --git a/include/kiwi/Types.h b/include/kiwi/Types.h new file mode 100644 index 0000000..0f40d8f --- /dev/null +++ b/include/kiwi/Types.h @@ -0,0 +1,280 @@ +/** + * @file Types.h + * @author bab2min (bab2min@gmail.com) + * @brief Kiwi C++ API에 쓰이는 주요 타입들을 모아놓은 헤더 파일 + * @version 0.10.0 + * @date 2021-08-31 + * + * + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef KIWI_USE_MIMALLOC +#include +#endif + +#define KIWI_DEFINE_ENUM_FLAG_OPERATORS(Type) \ +inline Type operator~(Type a)\ +{\ + return static_cast(~static_cast::type>(a));\ +}\ +inline bool operator!(Type a)\ +{\ + return a == static_cast(0);\ +}\ +inline Type operator|(Type a, Type b)\ +{\ + return static_cast(static_cast::type>(a) | static_cast::type>(b));\ +}\ +inline Type operator&(Type a, Type b)\ +{\ + return static_cast(static_cast::type>(a) & static_cast::type>(b));\ +}\ +inline Type operator^(Type a, Type b)\ +{\ + return static_cast(static_cast::type>(a) ^ static_cast::type>(b));\ +}\ +inline Type operator|=(Type& a, Type b)\ +{\ + return reinterpret_cast(reinterpret_cast::type&>(a) |= static_cast::type>(b));\ +}\ +inline Type operator&=(Type& a, Type b)\ +{\ + return reinterpret_cast(reinterpret_cast::type&>(a) &= static_cast::type>(b));\ +}\ +inline Type operator^=(Type& a, Type b)\ +{\ + return reinterpret_cast(reinterpret_cast::type&>(a) ^= static_cast::type>(b));\ +} + +namespace kiwi +{ + typedef char16_t kchar_t; + + class Exception : public std::runtime_error + { + public: + using 
std::runtime_error::runtime_error; + }; + + class UnicodeException : public Exception + { + public: + using Exception::Exception; + }; + +#ifdef KIWI_USE_MIMALLOC + template + using Vector = std::vector<_Ty, mi_stl_allocator<_Ty>>; + + template + using UnorderedMap = std::unordered_map<_K, _V, std::hash<_K>, std::equal_to<_K>, mi_stl_allocator>>; + + using KString = std::basic_string, mi_stl_allocator>; + using KStringStream = std::basic_stringstream, mi_stl_allocator>; + using KcVector = Vector; + using KcScores = Vector>; +#else + /** + * @brief std::vector의 내부용 타입. mimalloc 옵션에 따라 mi_stl_allocator로부터 메모리를 할당받는다. + * + * @note Vector는 std::vector와 동일한 역할을 수행하지만, + * mimalloc 사용시 Vector가 좀 더 빠른 속도로 메모리를 할당 받을 수 있음. + * Vector와 std::vector는 섞어 쓸 수 없다. + * Kiwi 내부에서만 사용할 것이라면 Vector를, 외부로 반환해야할 값이라면 std::vector를 사용할 것. + */ + template + using Vector = std::vector<_Ty>; + + /** + * @brief std::unordered_map의 내부용 타입. mimalloc 옵션에 따라 mi_stl_allocator로부터 메모리를 할당받는다. + * + * @note UnorderMap은 std::unordered_map과 동일한 역할을 수행하지만, + * mimalloc 사용시 UnorderMap이 좀 더 빠른 속도로 메모리를 할당 받을 수 있음. + * @sa Vector + */ + template + using UnorderedMap = std::unordered_map<_K, _V>; + + /** + * @brief std::u16string의 내부용 타입. mimalloc 옵션에 따라 mi_stl_allocator로부터 메모리를 할당받는다. + * + * @note KString은 std::u16string과 동일한 역할을 수행하지만, + * mimalloc 사용시 KString이 좀 더 빠른 속도로 메모리를 할당 받을 수 있음. + * @sa Vector + */ + using KString = std::basic_string; + using KStringStream = std::basic_stringstream; + using KcVector = Vector; + using KcScores = Vector>; +#endif + + /** + * @brief 형태소 품사 태그와 관련된 열거형 + * + * @note 나머지 품사 태그에 대한 정보는 README.md 를 참조할 것. 
+ */ + enum class POSTag : uint8_t + { + unknown, /**< 미설정 */ + nng, nnp, nnb, + vv, va, + mag, + nr, np, + vx, + mm, maj, + ic, + xpn, xsn, xsv, xsa, xr, + vcp, vcn, + sf, sp, ss, se, so, sw, + sl, sh, sn, + w_url, w_email, w_mention, w_hashtag, + jks, jkc, jkg, jko, jkb, jkv, jkq, jx, jc, + ep, ef, ec, etn, etm, + v, /**< 분할된 동사/형용사를 나타내는데 사용됨 */ + max, /**< POSTag의 총 개수를 나타내는 용도 */ + }; + + constexpr size_t defaultTagSize = (size_t)POSTag::jks; + + /** + * @brief 선행 형태소의 종성 여부 조건과 관련된 열거형 + * + */ + enum class CondVowel : uint8_t + { + none, /**< 조건이 설정되지 않음 */ + any, /**< 자음, 모음 여부와 상관 없이 등장 가능 */ + vowel, /**< 선행 형태소가 받침이 없는 경우만 등장 가능*/ + vocalic, /**< 선행 형태소가 받침이 없거나 ㄹ받침인 경우만 등장 가능*/ + vocalic_h, /**< 선행 형태소가 받침이 없거나 ㄹ, ㅎ 받침인 경우만 등장 가능 */ + non_vowel, /**< `vowel`의 부정 */ + non_vocalic, /**< `vocalic`의 부정 */ + non_vocalic_h, /**< `vocalic_h`의 부정 */ + }; + + /** + * @brief 선행 형태소의 양/음성 조건(모음 조화)과 관련된 열거형 + * + */ + enum class CondPolarity : char + { + none, /**< 조건이 설정되지 않음 */ + positive, /**< 선행 형태소가 양성(ㅏ,ㅑ,ㅗ)인 경우만 등장 가능 */ + negative, /**< 선행 형태소가 음성(그 외)인 경우만 등장 가능 */ + }; + + /** + * @brief KiwiBuilder 생성시 사용되는 비트 플래그 + * + * @sa `kiwi::KiwiBuilder` + */ + enum class BuildOption + { + none = 0, + + integrateAllomorph = 1 << 0, /**< 이형태 통합 여부를 설정한다. 이 옵션을 사용시 `아/EC, 어/EC, 여/EC` 와 같은 형태소들이 `어/EC`로 통합되어 출력된다. */ + + loadDefaultDict = 1 << 1, /**< 기본 사전(default.dict)의 로딩 여부를 설정한다. 기본 사전은 위키백과 및 나무위키의 표제어로 구성되어 있다. 
*/ + }; + + struct Morpheme; + + /** + * @brief 분석 완료된 각 형태소들의 정보를 담는 구조체 + * + */ + struct TokenInfo + { + std::u16string str; /**< 형태 */ + uint32_t position = 0; /**< 시작 위치(UTF16 문자 기준) */ + uint16_t length = 0; /**< 길이(UTF16 문자 기준) */ + uint16_t wordPosition = 0; /**< 어절 번호(공백 기준)*/ + POSTag tag = POSTag::unknown; /**< 품사 태그 */ + const Morpheme* morph = nullptr; /**< 기타 형태소 정보에 대한 포인터 (OOV인 경우 nullptr) */ + + TokenInfo() = default; + + TokenInfo(const std::u16string& _str, + POSTag _tag = POSTag::unknown, + uint16_t _length = 0, + uint32_t _position = 0, + uint16_t _wordPosition = 0 + ) + : str{ _str }, position{ _position }, length{ _length }, wordPosition{ _wordPosition }, tag{ _tag } + { + } + + bool operator==(const TokenInfo& o) const + { + return str == o.str && tag == o.tag; + } + + bool operator!=(const TokenInfo& o) const + { + return !operator==(o); + } + }; + + struct FormCond + { + KString form; + CondVowel vowel; + CondPolarity polar; + + FormCond(); + ~FormCond(); + FormCond(const FormCond&); + FormCond(FormCond&&); + FormCond& operator=(const FormCond&); + FormCond& operator=(FormCond&&); + + FormCond(const KString& _form, CondVowel _vowel, CondPolarity _polar); + bool operator==(const FormCond& o) const; + bool operator!=(const FormCond& o) const; + }; + + /** + * @brief 분석 완료된 형태소의 목록(`std::vector`)과 점수(`float`)의 pair 타입 + * + */ + using TokenResult = std::pair, float>; + + using U16Reader = std::function; + using U16MultipleReader = std::function; +} + +namespace std +{ +#ifdef KIWI_USE_MIMALLOC + template<> + struct hash + { + size_t operator()(const kiwi::KString& s) const + { + return hash>{}({ s.begin(), s.end() }); + } + }; +#endif + + template<> + struct hash + { + size_t operator()(const kiwi::FormCond& fc) const + { + return hash{}(fc.form) ^ ((size_t)fc.vowel | ((size_t)fc.polar << 8)); + } + }; +} + +KIWI_DEFINE_ENUM_FLAG_OPERATORS(kiwi::BuildOption); diff --git a/include/kiwi/Utils.h b/include/kiwi/Utils.h new file mode 100644 index 
0000000..204da91 --- /dev/null +++ b/include/kiwi/Utils.h @@ -0,0 +1,174 @@ +#pragma once +#include +#include +#include +#include "Types.h" + +namespace kiwi +{ + template::value, int>::type = 0 + > + std::unique_ptr make_unique(Args&&... args) + { + return std::unique_ptr(new T(std::forward(args)...)); + } + + template::value, int>::type = 0 + > + std::unique_ptr make_unique(size_t size) + { + return std::unique_ptr(new typename std::remove_extent::type[size]); + } + + std::u16string utf8To16(const std::string& str); + std::string utf16To8(const std::u16string& str); + + inline bool isWebTag(POSTag t) + { + return POSTag::w_url <= t && t <= POSTag::w_hashtag; + } + + POSTag toPOSTag(const std::u16string& tagStr); + const char* tagToString(POSTag t); + const kchar_t* tagToKString(POSTag t); + + inline bool isHangulCoda(int chr) + { + return 0x11A8 <= chr && chr < (0x11A7 + 28); + } + + KString normalizeHangul(const std::u16string& hangul); + std::u16string joinHangul(const KString& hangul); + + template + void split(const std::basic_string& s, BaseChr delim, OutIterator result) + { + size_t p = 0; + while (1) + { + size_t t = s.find(delim, p); + if (t == s.npos) + { + *(result++) = s.substr(p); + break; + } + else + { + *(result++) = s.substr(p, t - p); + p = t + 1; + } + } + } + + template + inline std::vector> split(const std::basic_string& s, BaseChr delim) + { + std::vector> elems; + split(s, delim, std::back_inserter(elems)); + return elems; + } + + + template + inline float stof(ChrIterator begin, ChrIterator end) + { + if (begin == end) return 0; + bool sign = false; + switch (*begin) + { + case '-': + sign = true; + case '+': + ++begin; + break; + } + double up = 0, down = 0; + for (; begin != end; ++begin) + { + if ('0' <= *begin && *begin <= '9') up = up * 10 + (*begin - '0'); + else break; + } + if (begin != end && *begin == '.') + { + ++begin; + float d = 1; + for (; begin != end; ++begin) + { + if ('0' <= *begin && *begin <= '9') + { + down = down * 10 
+ (*begin - '0'); + d /= 10; + } + else break; + } + up += down * d; + } + return up * (sign ? -1 : 1); + } + + inline std::ostream& operator <<(std::ostream& os, const KString& str) + { + return os << utf16To8({ str.begin(), str.end() }); + } + + POSTag identifySpecialChr(kchar_t chr); + + class SpaceSplitIterator + { + static bool isspace(char16_t c) + { + switch (c) + { + case u' ': + case u'\f': + case u'\n': + case u'\r': + case u'\t': + case u'\v': + return true; + } + return false; + } + + std::u16string::const_iterator mBegin, mChunk, mEnd; + public: + SpaceSplitIterator(const std::u16string::const_iterator& _begin = {}, const std::u16string::const_iterator& _end = {}) + : mBegin(_begin), mEnd(_end) + { + while (mBegin != mEnd && isspace(*mBegin)) ++mBegin; + mChunk = mBegin; + while (mChunk != mEnd && !isspace(*mChunk)) ++mChunk; + } + + SpaceSplitIterator& operator++() + { + mBegin = mChunk; + while (mBegin != mEnd && isspace(*mBegin)) ++mBegin; + mChunk = mBegin; + while (mChunk != mEnd && !isspace(*mChunk)) ++mChunk; + return *this; + } + + bool operator==(const SpaceSplitIterator& o) const + { + if (o.mBegin == o.mEnd) return mBegin == mEnd; + return mBegin == o.mBegin; + } + + bool operator!=(const SpaceSplitIterator& o) const + { + return !operator==(o); + } + + std::u16string operator*() const + { + return { mBegin, mChunk }; + } + + std::u16string::const_iterator strBegin() const { return mBegin; } + std::u16string::const_iterator strEnd() const { return mChunk; } + size_t strSize() const { return distance(mBegin, mChunk); } + }; +} \ No newline at end of file diff --git a/include/kiwi/WordDetector.h b/include/kiwi/WordDetector.h new file mode 100644 index 0000000..d54cc3d --- /dev/null +++ b/include/kiwi/WordDetector.h @@ -0,0 +1,57 @@ +#pragma once + +#include + +namespace kiwi +{ + struct WordInfo + { + std::u16string form; + float score, lBranch, rBranch, lCohesion, rCohesion; + uint32_t freq; + std::map posScore; + + WordInfo(std::u16string 
_form = {}, + float _score = 0, float _lBranch = 0, float _rBranch = 0, + float _lCohesion = 0, float _rCohesion = 0, uint32_t _freq = 0, + std::map&& _posScore = {}) + : form(_form), score(_score), lBranch(_lBranch), rBranch(_rBranch), + lCohesion(_lCohesion), rCohesion(_rCohesion), freq(_freq), posScore(_posScore) + {} + }; + + class WordDetector + { + struct Counter; + protected: + size_t numThreads = 0; + std::map, std::map> posScore; + std::map nounTailScore; + + void loadPOSModelFromTxt(std::istream& is); + void loadNounTailModelFromTxt(std::istream& is); + + void countUnigram(Counter&, const U16Reader& reader, size_t minCnt) const; + void countBigram(Counter&, const U16Reader& reader, size_t minCnt) const; + void countNgram(Counter&, const U16Reader& reader, size_t minCnt, size_t maxWordLen) const; + float branchingEntropy(const std::map& cnt, std::map::iterator it, size_t minCnt, float defaultPerp = 1.f) const; + std::map getPosScore(Counter&, const std::map& cnt, std::map::iterator it, bool coda, const std::u16string& realForm) const; + public: + + struct FromRawData {}; + static constexpr FromRawData fromRawDataTag = {}; + + WordDetector() = default; + WordDetector(const std::string& modelPath, size_t _numThreads = 0); + WordDetector(FromRawData, const std::string& modelPath, size_t _numThreads = 0); + + bool ready() const + { + return !posScore.empty(); + } + + void saveModel(const std::string& modelPath) const; + std::vector extractWords(const U16MultipleReader& reader, size_t minCnt = 10, size_t maxWordLen = 10, float minScore = 0.1f) const; + }; + +} \ No newline at end of file diff --git a/include/kiwi/capi.h b/include/kiwi/capi.h new file mode 100644 index 0000000..cbdd31a --- /dev/null +++ b/include/kiwi/capi.h @@ -0,0 +1,460 @@ +/** + * @file capi.h + * @author bab2min (bab2min@gmail.com) + * @brief Kiwi C API를 담고 있는 헤더 파일 + * @version 0.10.0 + * @date 2021-08-31 + * + * + */ + +#pragma once + +#include "Macro.h" + +#define KIWIERR_FAIL -1 
+#define KIWIERR_INVALID_HANDLE -2 +#define KIWIERR_INVALID_INDEX -3 + +#if !defined(DLL_EXPORT) +#define DECL_DLL +#elif defined(_MSC_VER) +#define DECL_DLL __declspec(dllexport) +#elif defined(__GNUC__) +#define DECL_DLL __attribute__((visibility("default"))) +#endif + +typedef struct kiwi_s* kiwi_h; +typedef struct kiwi_builder* kiwi_builder_h; +typedef struct kiwi_res* kiwi_res_h; +typedef struct kiwi_ws* kiwi_ws_h; +typedef unsigned short kchar16_t; + +/* +int (*kiwi_reader_t)(int id, char* buffer, void* user_data) +id: id number of line to be read. if id == 0, kiwi_reader should roll back file and read lines from the beginning +buffer: buffer where string data should be stored. if buffer == null, kiwi_reader provide the length of string as return value. +user_data: user_data from kiwi_extract~, kiwi_perform, kiwi_analyze_m functions. +*/ + +/** + * @brief 문자열을 읽어들여 Kiwi에 제공하기 위한 콜백 함수 타입 + * + * @param int 읽어들일 문자열의 줄 번호입니다. 0부터 시작하여 차례로 1씩 증가합니다. + * @param char* 읽어들인 문자열이 저장될 버퍼의 주소입니다. 이 값이 null인 경우 버퍼의 크기를 반환해야 합니다. + * @param void* user data를 위한 인자입니다. + * + * @return int 두번째 인자가 null인 경우 읽어들일 버퍼의 크기를 반환합니다. + */ +typedef int(*kiwi_reader_t)(int, char*, void*); +typedef int(*kiwi_reader_w_t)(int, kchar16_t*, void*); + + +typedef int(*kiwi_receiver_t)(int, kiwi_res_h, void*); + +enum +{ + KIWI_BUILD_LOAD_DEFAULT_DICT = 1, + KIWI_BUILD_INTEGRATE_ALLOMORPH = 2, + KIWI_BUILD_DEFAULT = 3, +}; + +enum +{ + KIWI_NUM_THREADS = 0x8001, +}; + +enum +{ + KIWI_MATCH_URL = 1, + KIWI_MATCH_EMAIL = 2, + KIWI_MATCH_HASHTAG = 4, + KIWI_MATCH_MENTION = 8, + KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION, + KIWI_MATCH_NORMALIZE_CODA = 65536, + KIWI_MATCH_ALL_WITH_NORMALIZING = KIWI_MATCH_ALL | KIWI_MATCH_NORMALIZE_CODA, +}; + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief 설치된 Kiwi의 버전을 반환합니다. 
+ * + * @return "major.minor.patch"로 구성되는 버전 문자열 + */ +DECL_DLL const char* kiwi_version(); + +/** + * @brief 현재 스레드에서 발생한 에러 메세지를 반환합니다. 발생한 에러가 없을 경우 nullptr를 반환합니다. + * + * @return 에러 메세지 혹은 nullptr + */ +DECL_DLL const char* kiwi_error(); + +/** + * @brief 현재 스레드의 에러 메세지를 초기화합니다. + * + * @return + */ +DECL_DLL void kiwi_clear_error(); + +/** + * @brief Kiwi Builder를 생성합니다 + * + * @param model_path 모델의 경로 + * @param num_threads 사용할 스레드의 개수. 0으로 지정시 가용한 스레드 개수를 자동으로 판단합니다. + * @param options 생성 옵션. KIWI_BUILD_* 열거형을 참조하십시오. + * @return 성공 시 Kiwi Builder의 핸들을 반환합니다. + * 실패시 nullptr를 반환하고 에러 메세지를 설정합니다. + * 에러 메세지는 kiwi_error()를 통해 확인할 수 있습니다. + */ +DECL_DLL kiwi_builder_h kiwi_builder_init(const char* model_path, int num_threads, int options); + +/** + * @brief + * + * @param handle + * @return + */ +DECL_DLL int kiwi_builder_close(kiwi_builder_h handle); + +/** + * @brief + * + * @param handle + * @param word + * @param pos + * @param score + * @return + */ +DECL_DLL int kiwi_builder_add_word(kiwi_builder_h handle, const char* word, const char* pos, float score); + +/** + * @brief + * + * @param handle + * @param dict_path + * @return + */ +DECL_DLL int kiwi_builder_load_dict(kiwi_builder_h handle, const char* dict_path); + +/** + * @brief + * + * @param handle + * @param reader + * @param user_data + * @param min_cnt + * @param max_word_len + * @param min_score + * @param pos_threshold + * @return + */ +DECL_DLL kiwi_ws_h kiwi_builder_extract_words(kiwi_builder_h handle, kiwi_reader_t reader, void* user_data, int min_cnt, int max_word_len, float min_score, float pos_threshold); + +/** + * @brief + * + * @param handle + * @param reader + * @param user_data + * @param min_cnt + * @param max_word_len + * @param min_score + * @param pos_threshold + * @return + */ +DECL_DLL kiwi_ws_h kiwi_builder_extract_add_words(kiwi_builder_h handle, kiwi_reader_t reader, void* user_data, int min_cnt, int max_word_len, float min_score, float pos_threshold); + +/** + * @brief + * + 
* @param handle + * @param reader + * @param user_data + * @param min_cnt + * @param max_word_len + * @param min_score + * @param pos_threshold + * @return + */ +DECL_DLL kiwi_ws_h kiwi_builder_extract_words_w(kiwi_builder_h handle, kiwi_reader_w_t reader, void* user_data, int min_cnt, int max_word_len, float min_score, float pos_threshold); + +/** + * @brief + * + * @param handle + * @param reader + * @param user_data + * @param min_cnt + * @param max_word_len + * @param min_score + * @param pos_threshold + * @return + */ +DECL_DLL kiwi_ws_h kiwi_builder_extract_add_words_w(kiwi_builder_h handle, kiwi_reader_w_t reader, void* user_data, int min_cnt, int max_word_len, float min_score, float pos_threshold); + +/** + * @brief + * + * @param handle + * @return + */ +DECL_DLL kiwi_h kiwi_builder_build(kiwi_builder_h handle); + +/** + * @brief + * + * @param model_path + * @param num_threads + * @param options + * @return + */ +DECL_DLL kiwi_h kiwi_init(const char* model_path, int num_threads, int options); + +/** + * @brief + * + * @param handle + * @param option + * @param value + * @return + */ +DECL_DLL void kiwi_set_option(kiwi_h handle, int option, int value); + +/** + * @brief + * + * @param handle + * @param option + * @return + */ +DECL_DLL int kiwi_get_option(kiwi_h handle, int option); + +/** + * @brief + * + * @param handle + * @param text + * @param top_n + * @param match_options + * @return + */ +DECL_DLL kiwi_res_h kiwi_analyze_w(kiwi_h handle, const kchar16_t* text, int top_n, int match_options); + +/** + * @brief + * + * @param handle + * @param text + * @param top_n + * @param match_options + * @return + */ +DECL_DLL kiwi_res_h kiwi_analyze(kiwi_h handle, const char* text, int top_n, int match_options); + +/** + * @brief + * + * @param handle + * @param reader + * @param receiver + * @param user_data + * @param top_n + * @param match_options + * @return + */ +DECL_DLL int kiwi_analyze_mw(kiwi_h handle, kiwi_reader_w_t reader, kiwi_receiver_t receiver, 
void* user_data, int top_n, int match_options); + +/** + * @brief + * + * @param handle + * @param reader + * @param receiver + * @param user_data + * @param top_n + * @param match_options + * @return + */ +DECL_DLL int kiwi_analyze_m(kiwi_h handle, kiwi_reader_t reader, kiwi_receiver_t receiver, void* user_data, int top_n, int match_options); + +/** + * @brief + * + * @param handle + * @return + */ +DECL_DLL int kiwi_close(kiwi_h handle); + +/** + * @brief + * + * @param result + * @return + */ +DECL_DLL int kiwi_res_size(kiwi_res_h result); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL float kiwi_res_prob(kiwi_res_h result, int index); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL int kiwi_res_word_num(kiwi_res_h result, int index); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL const kchar16_t* kiwi_res_form_w(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL const kchar16_t* kiwi_res_tag_w(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL const char* kiwi_res_form(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL const char* kiwi_res_tag(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL int kiwi_res_position(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL int kiwi_res_length(kiwi_res_h result, int index, int num); + +/** + * @brief + * + * @param result + * @param index + * @param num + * @return + */ +DECL_DLL int kiwi_res_word_position(kiwi_res_h result, int index, int num); + +/** + * @brief + 
* + * @param result + * @return + */ +DECL_DLL int kiwi_res_close(kiwi_res_h result); + + +/** + * @brief + * + * @param result + * @return + */ +DECL_DLL int kiwi_ws_size(kiwi_ws_h result); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL const kchar16_t* kiwi_ws_form_w(kiwi_ws_h result, int index); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL const char* kiwi_ws_form(kiwi_ws_h result, int index); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL float kiwi_ws_score(kiwi_ws_h result, int index); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL int kiwi_ws_freq(kiwi_ws_h result, int index); + +/** + * @brief + * + * @param result + * @param index + * @return + */ +DECL_DLL float kiwi_ws_pos_score(kiwi_ws_h result, int index); + +/** + * @brief + * + * @param result + * @return + */ +DECL_DLL int kiwi_ws_close(kiwi_ws_h result); + +#ifdef __cplusplus +} +#endif diff --git a/kiwi.go b/kiwi.go index 6d0c09c..dc0159b 100644 --- a/kiwi.go +++ b/kiwi.go @@ -3,14 +3,18 @@ package kiwi /* #cgo LDFLAGS: -l kiwi +#cgo linux LDFLAGS: -L${SRCDIR}/libs/Linux_x86_64 -lkiwi +#cgo darwin LDFLAGS: -L${SRCDIR}/libs/Darwin_x86_64 -lkiwi +#cgo windows LDFLAGS: -L${SRCDIR}/libs/Windows_x86_64 -lkiwi #include #include #include // for uintptr_t -#include +#include extern int KiwiReaderBridge(int lineNumber, char *buffer, void *userData); */ + import "C" import ( diff --git a/libs/Darwin_x86_64/libkiwi.0.10.2.dylib b/libs/Darwin_x86_64/libkiwi.0.10.2.dylib new file mode 100755 index 0000000..8cfa3ee Binary files /dev/null and b/libs/Darwin_x86_64/libkiwi.0.10.2.dylib differ diff --git a/libs/Darwin_x86_64/libkiwi.0.dylib b/libs/Darwin_x86_64/libkiwi.0.dylib new file mode 120000 index 0000000..c270fb7 --- /dev/null +++ b/libs/Darwin_x86_64/libkiwi.0.dylib @@ -0,0 +1 @@ +libkiwi.0.10.2.dylib \ No newline at end of file diff --git 
a/libs/Darwin_x86_64/libkiwi.dylib b/libs/Darwin_x86_64/libkiwi.dylib new file mode 120000 index 0000000..0e3467f --- /dev/null +++ b/libs/Darwin_x86_64/libkiwi.dylib @@ -0,0 +1 @@ +libkiwi.0.dylib \ No newline at end of file diff --git a/libs/Darwin_x86_64/libkiwi_static.a b/libs/Darwin_x86_64/libkiwi_static.a new file mode 100644 index 0000000..380c67c Binary files /dev/null and b/libs/Darwin_x86_64/libkiwi_static.a differ diff --git a/libs/Linux_x86_64/libkiwi.so b/libs/Linux_x86_64/libkiwi.so new file mode 120000 index 0000000..3ba306a --- /dev/null +++ b/libs/Linux_x86_64/libkiwi.so @@ -0,0 +1 @@ +libkiwi.so.0 \ No newline at end of file diff --git a/libs/Linux_x86_64/libkiwi.so.0 b/libs/Linux_x86_64/libkiwi.so.0 new file mode 120000 index 0000000..d4a8435 --- /dev/null +++ b/libs/Linux_x86_64/libkiwi.so.0 @@ -0,0 +1 @@ +libkiwi.so.0.10.2 \ No newline at end of file diff --git a/libs/Linux_x86_64/libkiwi.so.0.10.2 b/libs/Linux_x86_64/libkiwi.so.0.10.2 new file mode 100755 index 0000000..3387955 Binary files /dev/null and b/libs/Linux_x86_64/libkiwi.so.0.10.2 differ diff --git a/libs/Linux_x86_64/libkiwi_static.a b/libs/Linux_x86_64/libkiwi_static.a new file mode 100644 index 0000000..376b78e Binary files /dev/null and b/libs/Linux_x86_64/libkiwi_static.a differ diff --git a/libs/Windows_x86_64/kiwi-cli-0.10.2.exe b/libs/Windows_x86_64/kiwi-cli-0.10.2.exe new file mode 100644 index 0000000..31e7017 Binary files /dev/null and b/libs/Windows_x86_64/kiwi-cli-0.10.2.exe differ diff --git a/libs/Windows_x86_64/kiwi-evaluator.exe b/libs/Windows_x86_64/kiwi-evaluator.exe new file mode 100644 index 0000000..e837b60 Binary files /dev/null and b/libs/Windows_x86_64/kiwi-evaluator.exe differ diff --git a/libs/Windows_x86_64/kiwi-test.exe b/libs/Windows_x86_64/kiwi-test.exe new file mode 100644 index 0000000..d5063fa Binary files /dev/null and b/libs/Windows_x86_64/kiwi-test.exe differ diff --git a/libs/Windows_x86_64/kiwi.dll b/libs/Windows_x86_64/kiwi.dll new file mode 
100644
index 0000000..c741d6f
Binary files /dev/null and b/libs/Windows_x86_64/kiwi.dll differ
diff --git a/libs/Windows_x86_64/kiwi.exp b/libs/Windows_x86_64/kiwi.exp
new file mode 100644
index 0000000..c1d1799
Binary files /dev/null and b/libs/Windows_x86_64/kiwi.exp differ
diff --git a/libs/Windows_x86_64/kiwi.lib b/libs/Windows_x86_64/kiwi.lib
new file mode 100644
index 0000000..baa596c
Binary files /dev/null and b/libs/Windows_x86_64/kiwi.lib differ
diff --git a/libs/Windows_x86_64/kiwi_mt_static.lib b/libs/Windows_x86_64/kiwi_mt_static.lib
new file mode 100644
index 0000000..52bf4e1
Binary files /dev/null and b/libs/Windows_x86_64/kiwi_mt_static.lib differ
diff --git a/libs/Windows_x86_64/kiwi_static.lib b/libs/Windows_x86_64/kiwi_static.lib
new file mode 100644
index 0000000..39a5a65
Binary files /dev/null and b/libs/Windows_x86_64/kiwi_static.lib differ
diff --git a/scripts/download_kiwi_into_this_repository.bash b/scripts/download_kiwi_into_this_repository.bash
new file mode 100644
index 0000000..200f310
--- /dev/null
+++ b/scripts/download_kiwi_into_this_repository.bash
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+#
+# Downloads the prebuilt Kiwi libraries (Linux, macOS and Windows, x86_64) and
+# the public headers of one Kiwi release into libs/ and include/ of this
+# repository.
+#
+# Usage: download_kiwi_into_this_repository.bash KIWI_VERSION   (e.g. v0.10.2)
+
+set -euo pipefail
+
+if [ -z "${1:-}" ]; then
+  echo "usage: $0 KIWI_VERSION (e.g. v0.10.2)" >&2
+  exit 1
+fi
+
+# e.g. v0.10.2
+KIWI_VERSION="$1"
+
+# kiwigo/scripts
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+PROJECT_DIR=$(dirname "$SCRIPT_DIR")
+
+# Work in a private scratch directory: a failed run leaves no archives behind
+# and the cleanup globs below cannot match files of the caller.
+WORK_DIR=$(mktemp -d)
+trap 'rm -rf "$WORK_DIR"' EXIT
+cd "$WORK_DIR"
+
+OS_LIST=(
+  'Linux'
+  'Darwin'
+)
+
+# Maps an OS_LIST entry to the OS tag used in the release archive name.
+function convert() {
+  if [ "$1" == "Linux" ]; then
+    echo 'lnx'
+  elif [ "$1" == "Darwin" ]; then
+    echo 'mac'
+  else
+    echo "unsupported OS: $1" >&2
+    return 1
+  fi
+}
+
+# Fetches the release archive for $KIWI_OS_NAME and moves its build/libkiwi*
+# files into $LIBRARY_PATH; both variables are set by main().
+function install_library() {
+  wget -qO kiwi.tgz "https://github.com/bab2min/Kiwi/releases/download/${KIWI_VERSION}/kiwi_${KIWI_OS_NAME}_x86_64_${KIWI_VERSION}.tgz" &&
+    tar xzvf kiwi.tgz &&
+    mkdir -p "$LIBRARY_PATH" &&
+    mv build/libkiwi* "$LIBRARY_PATH/" &&
+    rm -rf kiwi.tgz build
+
+  return $?
+}
+
+# Fetches the source archive of the release and copies its include/kiwi
+# directory into this repository's include/ directory.
+function install_header() {
+  # Strip only a leading "v"; ${KIWI_VERSION/v/} would drop the first "v" found
+  # anywhere in the version string.
+  local version_without_v="${KIWI_VERSION#v}"
+
+  wget -qO source.tgz "https://github.com/bab2min/Kiwi/archive/refs/tags/${KIWI_VERSION}.tar.gz" &&
+    tar xzvf source.tgz &&
+    mkdir -p "$PROJECT_DIR/include" &&
+    cp -r "Kiwi-${version_without_v}/include/kiwi" "$PROJECT_DIR/include/" &&
+    rm -rf source.tgz Kiwi-*
+
+  return $?
+}
+
+# Fetches the Windows release archive and unpacks it into libs/Windows_x86_64.
+function install_library_for_windows() {
+  LIBRARY_PATH="${PROJECT_DIR}/libs/Windows_x86_64"
+
+  # -o: overwrite files from an earlier run instead of prompting interactively.
+  wget -qO kiwi.zip "https://github.com/bab2min/Kiwi/releases/download/${KIWI_VERSION}/kiwi_win_x64_${KIWI_VERSION}.zip" &&
+    mkdir -p "$LIBRARY_PATH" &&
+    unzip -o kiwi.zip -d "$LIBRARY_PATH" &&
+    rm -f kiwi.zip
+
+  return $?
+}
+
+# Installs the libraries for every OS in OS_LIST, then Windows, then the headers.
+function main() {
+  echo "Installing Kiwi version ${KIWI_VERSION}"
+
+  for OS in "${OS_LIST[@]}"; do
+    echo "Downloading library for ${OS}"
+
+    KIWI_OS_NAME=$(convert "$OS")
+    LIBRARY_PATH="${PROJECT_DIR}/libs/${OS}_x86_64"
+
+    install_library
+  done
+
+  install_library_for_windows
+
+  install_header
+}
+
+main