|
| 1 | +//===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===// |
| 2 | +// |
| 3 | +// The LLVM Compiler Infrastructure |
| 4 | +// |
| 5 | +// This file is distributed under the University of Illinois Open Source |
| 6 | +// License. See LICENSE.TXT for details. |
| 7 | +// |
| 8 | +//===----------------------------------------------------------------------===// |
| 9 | +// fuzzer::InputCorpus |
| 10 | +//===----------------------------------------------------------------------===// |
| 11 | + |
| 12 | +#ifndef LLVM_FUZZER_CORPUS |
| 13 | +#define LLVM_FUZZER_CORPUS |
| 14 | + |
| 15 | +#include "FuzzerDefs.h" |
| 16 | +#include "FuzzerIO.h" |
| 17 | +#include "FuzzerRandom.h" |
| 18 | +#include "FuzzerSHA1.h" |
| 19 | +#include "FuzzerTracePC.h" |
| 20 | +#include <algorithm> |
| 21 | +#include <numeric> |
| 22 | +#include <random> |
| 23 | +#include <unordered_set> |
| 24 | + |
| 25 | +namespace fuzzer { |
| 26 | + |
| 27 | +struct InputInfo { |
| 28 | + Unit U; // The actual input data. |
| 29 | + uint8_t Sha1[kSHA1NumBytes]; // Checksum. |
| 30 | + // Number of features that this input has and no smaller input has. |
| 31 | + size_t NumFeatures = 0; |
| 32 | + size_t Tmp = 0; // Used by ValidateFeatureSet. |
| 33 | + // Stats. |
| 34 | + size_t NumExecutedMutations = 0; |
| 35 | + size_t NumSuccessfullMutations = 0; |
| 36 | + bool MayDeleteFile = false; |
| 37 | +}; |
| 38 | + |
| 39 | +class InputCorpus { |
| 40 | + public: |
| 41 | + static const size_t kFeatureSetSize = 1 << 16; |
| 42 | + InputCorpus(const std::string &OutputCorpus) : OutputCorpus(OutputCorpus) { |
| 43 | + memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature)); |
| 44 | + memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature)); |
| 45 | + } |
| 46 | + ~InputCorpus() { |
| 47 | + for (auto II : Inputs) |
| 48 | + delete II; |
| 49 | + } |
| 50 | + size_t size() const { return Inputs.size(); } |
| 51 | + size_t SizeInBytes() const { |
| 52 | + size_t Res = 0; |
| 53 | + for (auto II : Inputs) |
| 54 | + Res += II->U.size(); |
| 55 | + return Res; |
| 56 | + } |
| 57 | + size_t NumActiveUnits() const { |
| 58 | + size_t Res = 0; |
| 59 | + for (auto II : Inputs) |
| 60 | + Res += !II->U.empty(); |
| 61 | + return Res; |
| 62 | + } |
| 63 | + size_t MaxInputSize() const { |
| 64 | + size_t Res = 0; |
| 65 | + for (auto II : Inputs) |
| 66 | + Res = std::max(Res, II->U.size()); |
| 67 | + return Res; |
| 68 | + } |
| 69 | + bool empty() const { return Inputs.empty(); } |
| 70 | + const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; } |
| 71 | + void AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile = false) { |
| 72 | + assert(!U.empty()); |
| 73 | + uint8_t Hash[kSHA1NumBytes]; |
| 74 | + if (FeatureDebug) |
| 75 | + Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures); |
| 76 | + ComputeSHA1(U.data(), U.size(), Hash); |
| 77 | + Hashes.insert(Sha1ToString(Hash)); |
| 78 | + Inputs.push_back(new InputInfo()); |
| 79 | + InputInfo &II = *Inputs.back(); |
| 80 | + II.U = U; |
| 81 | + II.NumFeatures = NumFeatures; |
| 82 | + II.MayDeleteFile = MayDeleteFile; |
| 83 | + memcpy(II.Sha1, Hash, kSHA1NumBytes); |
| 84 | + UpdateCorpusDistribution(); |
| 85 | + ValidateFeatureSet(); |
| 86 | + } |
| 87 | + |
| 88 | + bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } |
| 89 | + bool HasUnit(const std::string &H) { return Hashes.count(H); } |
| 90 | + InputInfo &ChooseUnitToMutate(Random &Rand) { |
| 91 | + InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)]; |
| 92 | + assert(!II.U.empty()); |
| 93 | + return II; |
| 94 | + }; |
| 95 | + |
| 96 | + // Returns an index of random unit from the corpus to mutate. |
| 97 | + // Hypothesis: units added to the corpus last are more likely to be |
| 98 | + // interesting. This function gives more weight to the more recent units. |
| 99 | + size_t ChooseUnitIdxToMutate(Random &Rand) { |
| 100 | + size_t Idx = static_cast<size_t>(CorpusDistribution(Rand)); |
| 101 | + assert(Idx < Inputs.size()); |
| 102 | + return Idx; |
| 103 | + } |
| 104 | + |
| 105 | + void PrintStats() { |
| 106 | + for (size_t i = 0; i < Inputs.size(); i++) { |
| 107 | + const auto &II = *Inputs[i]; |
| 108 | + Printf(" [%zd %s]\tsz: %zd\truns: %zd\tsucc: %zd\n", i, |
| 109 | + Sha1ToString(II.Sha1).c_str(), II.U.size(), |
| 110 | + II.NumExecutedMutations, II.NumSuccessfullMutations); |
| 111 | + } |
| 112 | + } |
| 113 | + |
| 114 | + void PrintFeatureSet() { |
| 115 | + for (size_t i = 0; i < kFeatureSetSize; i++) { |
| 116 | + if(size_t Sz = GetFeature(i)) |
| 117 | + Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz); |
| 118 | + } |
| 119 | + Printf("\n\t"); |
| 120 | + for (size_t i = 0; i < Inputs.size(); i++) |
| 121 | + if (size_t N = Inputs[i]->NumFeatures) |
| 122 | + Printf(" %zd=>%zd ", i, N); |
| 123 | + Printf("\n"); |
| 124 | + } |
| 125 | + |
| 126 | + void DeleteInput(size_t Idx) { |
| 127 | + InputInfo &II = *Inputs[Idx]; |
| 128 | + if (!OutputCorpus.empty() && II.MayDeleteFile) |
| 129 | + RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1))); |
| 130 | + Unit().swap(II.U); |
| 131 | + if (FeatureDebug) |
| 132 | + Printf("EVICTED %zd\n", Idx); |
| 133 | + } |
| 134 | + |
| 135 | + bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { |
| 136 | + assert(NewSize); |
| 137 | + Idx = Idx % kFeatureSetSize; |
| 138 | + uint32_t OldSize = GetFeature(Idx); |
| 139 | + if (OldSize == 0 || (Shrink && OldSize > NewSize)) { |
| 140 | + if (OldSize > 0) { |
| 141 | + size_t OldIdx = SmallestElementPerFeature[Idx]; |
| 142 | + InputInfo &II = *Inputs[OldIdx]; |
| 143 | + assert(II.NumFeatures > 0); |
| 144 | + II.NumFeatures--; |
| 145 | + if (II.NumFeatures == 0) |
| 146 | + DeleteInput(OldIdx); |
| 147 | + } |
| 148 | + if (FeatureDebug) |
| 149 | + Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); |
| 150 | + SmallestElementPerFeature[Idx] = Inputs.size(); |
| 151 | + InputSizesPerFeature[Idx] = NewSize; |
| 152 | + CountingFeatures = true; |
| 153 | + return true; |
| 154 | + } |
| 155 | + return false; |
| 156 | + } |
| 157 | + |
| 158 | + size_t NumFeatures() const { |
| 159 | + size_t Res = 0; |
| 160 | + for (size_t i = 0; i < kFeatureSetSize; i++) |
| 161 | + Res += GetFeature(i) != 0; |
| 162 | + return Res; |
| 163 | + } |
| 164 | + |
| 165 | + void ResetFeatureSet() { |
| 166 | + assert(Inputs.empty()); |
| 167 | + memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature)); |
| 168 | + memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature)); |
| 169 | + } |
| 170 | + |
| 171 | +private: |
| 172 | + |
| 173 | + static const bool FeatureDebug = false; |
| 174 | + |
| 175 | + size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } |
| 176 | + |
| 177 | + void ValidateFeatureSet() { |
| 178 | + if (!CountingFeatures) return; |
| 179 | + if (FeatureDebug) |
| 180 | + PrintFeatureSet(); |
| 181 | + for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) |
| 182 | + if (GetFeature(Idx)) |
| 183 | + Inputs[SmallestElementPerFeature[Idx]]->Tmp++; |
| 184 | + for (auto II: Inputs) { |
| 185 | + if (II->Tmp != II->NumFeatures) |
| 186 | + Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures); |
| 187 | + assert(II->Tmp == II->NumFeatures); |
| 188 | + II->Tmp = 0; |
| 189 | + } |
| 190 | + } |
| 191 | + |
| 192 | + // Updates the probability distribution for the units in the corpus. |
| 193 | + // Must be called whenever the corpus or unit weights are changed. |
| 194 | + void UpdateCorpusDistribution() { |
| 195 | + size_t N = Inputs.size(); |
| 196 | + Intervals.resize(N + 1); |
| 197 | + Weights.resize(N); |
| 198 | + std::iota(Intervals.begin(), Intervals.end(), 0); |
| 199 | + if (CountingFeatures) |
| 200 | + for (size_t i = 0; i < N; i++) |
| 201 | + Weights[i] = Inputs[i]->NumFeatures * (i + 1); |
| 202 | + else |
| 203 | + std::iota(Weights.begin(), Weights.end(), 1); |
| 204 | + CorpusDistribution = std::piecewise_constant_distribution<double>( |
| 205 | + Intervals.begin(), Intervals.end(), Weights.begin()); |
| 206 | + } |
| 207 | + std::piecewise_constant_distribution<double> CorpusDistribution; |
| 208 | + |
| 209 | + std::vector<double> Intervals; |
| 210 | + std::vector<double> Weights; |
| 211 | + |
| 212 | + std::unordered_set<std::string> Hashes; |
| 213 | + std::vector<InputInfo*> Inputs; |
| 214 | + |
| 215 | + bool CountingFeatures = false; |
| 216 | + uint32_t InputSizesPerFeature[kFeatureSetSize]; |
| 217 | + uint32_t SmallestElementPerFeature[kFeatureSetSize]; |
| 218 | + |
| 219 | + std::string OutputCorpus; |
| 220 | +}; |
| 221 | + |
| 222 | +} // namespace fuzzer |
| 223 | + |
| 224 | +#endif // LLVM_FUZZER_CORPUS |
0 commit comments