Skip to content

Commit

Permalink
Limit sizes of frequency and 'missed' subtables.
Browse files Browse the repository at this point in the history
  • Loading branch information
ben-e-whitney committed Jun 28, 2022
1 parent 15d5710 commit 005c538
Show file tree
Hide file tree
Showing 2 changed files with 273 additions and 28 deletions.
285 changes: 259 additions & 26 deletions include/huffman.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ using Endpoints = google::protobuf::RepeatedField<google::protobuf::int64>;
using Missed = google::protobuf::RepeatedField<google::protobuf::int64>;
using Frequencies =
google::protobuf::Map<google::protobuf::uint64, google::protobuf::uint64>;
using SubtableSizes = google::protobuf::RepeatedField<google::protobuf::uint64>;

} // namespace

Expand Down Expand Up @@ -176,6 +177,221 @@ void HuffmanCode<Symbol>::recursively_set_codewords(
}
}

namespace {

//! Maximum number of elements per frequency/missed subtable.
//!
//! The value is 2^20 (1,048,576) elements. `Supertable` divides the total
//! element count by this constant (rounding up) to determine how many
//! subtables to create; every subtable but the last holds exactly this many
//! elements.
inline constexpr std::size_t SUBTABLE_MAX_SIZE = 1 << 20;

//! A logical table split into one or more subtables of moderate size.
//!
//! The logical table can be read by chaining the subtables.
//!
//! Note that derived classes store iterators into the fields of `subtables`
//! in `segments`. A copy of a `Supertable` would therefore carry segments that
//! still refer to the storage of the object it was copied from.
//!
//!\tparam Message Message type holding a single subtable.
//!\tparam It Iterator type used to traverse the subtable field of `Message`.
template <typename Message, typename It> struct Supertable {
  //! The beginning and size of a subtable.
  using Segment = std::pair<It, std::size_t>;

  //! Constructor.
  //!
  //! Construct an 'empty' `Supertable`. The data members will be given the
  //! right sizes, but for the most part they will not be populated. That is
  //! left to derived class constructors or callers. In particular, only the
  //! `second` (size) member of each segment is set here.
  //!
  //!\param nelements Total number of subtable entries.
  //!\param nbytes_subtables Sizes in bytes of the subtables (field in
  //! `pb::HuffmanHeader`). This field will be written to.
  Supertable(const std::size_t nelements, SubtableSizes &nbytes_subtables)
      // Ceiling division written so that it cannot wrap around for very large
      // `nelements` (the `(a + b - 1) / b` form can).
      : nsubtables(nelements / SUBTABLE_MAX_SIZE +
                   (nelements % SUBTABLE_MAX_SIZE ? 1 : 0)),
        subtables(nsubtables), segments(nsubtables),
        nbytes_subtables(nbytes_subtables) {
    nbytes_subtables.Resize(nsubtables, 0);

    if (nsubtables) {
      // Every subtable is full except possibly the last one.
      for (Segment &segment : segments) {
        segment.second = SUBTABLE_MAX_SIZE;
      }
      // If `nelements` is an exact (nonzero) multiple of `SUBTABLE_MAX_SIZE`,
      // the remainder is zero and the last subtable stays full.
      const std::size_t remainder = nelements % SUBTABLE_MAX_SIZE;
      if (remainder) {
        segments.back().second = remainder;
      }
    }
  }

  //! Constructor.
  //!
  //! Construct a `Supertable` from a collection of parsed messages. This
  //! constructor leaves the entries of `segments` default-constructed. This is
  //! because `Supertable` doesn't know which field of `Message` is the
  //! subtable.
  //!
  //!\param nbytes_subtables Sizes in bytes of the subtables (field in
  //! `pb::HuffmanHeader`).
  //!\param window Window into buffer containing messages to be parsed.
  Supertable(SubtableSizes &nbytes_subtables, BufferWindow &window)
      : nsubtables(nbytes_subtables.size()), subtables(nsubtables),
        segments(nsubtables), nbytes_subtables(nbytes_subtables) {
    for (std::size_t i = 0; i < nsubtables; ++i) {
      subtables.at(i) = read_message<Message>(window, nbytes_subtables.Get(i));
    }
  }

  //! Calculate and store the sizes in bytes of the subtables.
  //!
  //! This function should be called once the subtables are populated.
  void calculate_nbytes_subtables() {
    for (std::size_t i = 0; i < nsubtables; ++i) {
      nbytes_subtables.Set(i, subtables.at(i).ByteSize());
    }
  }

  //! Calculate the total size in bytes of the subtables.
  //!
  //! This function assumes no changes have been made to the subtables since the
  //! last call to `calculate_nbytes_subtables`.
  //!
  //!\return Sum of the entries of `nbytes_subtables`.
  std::size_t ByteSize() const {
    return std::accumulate(nbytes_subtables.begin(), nbytes_subtables.end(),
                           static_cast<std::size_t>(0));
  }

  //! Serialize the subtables, one after another, to a buffer.
  //!
  //! The size of each serialized subtable is taken from `nbytes_subtables`, so
  //! `calculate_nbytes_subtables` should have been called after the last
  //! modification of the subtables.
  //!
  //!\param p Buffer to which the subtables will be written.
  //!\param n Size in bytes of the buffer. Must equal the sum of the recorded
  //! subtable sizes.
  //!
  //!\throws std::invalid_argument If `n` doesn't match the total recorded
  //! size. Nothing is written to the buffer in this case.
  //!\throws std::runtime_error If a subtable fails to serialize.
  void SerializeToArray(void *const p, const std::size_t n) const {
    // Check the buffer size *before* writing so that a buffer of the wrong
    // size is never written to (and, if too small, never overrun).
    std::size_t expected = 0;
    for (std::size_t i = 0; i < nsubtables; ++i) {
      expected += nbytes_subtables.Get(i);
    }
    if (expected != n) {
      throw std::invalid_argument("serialization buffer size incorrect");
    }

    unsigned char *q = static_cast<unsigned char *>(p);
    for (std::size_t i = 0; i < nsubtables; ++i) {
      const google::protobuf::uint64 nbytes_subtable = nbytes_subtables.Get(i);
      if (!subtables.at(i).SerializeToArray(q, nbytes_subtable)) {
        throw std::runtime_error("failed to serialize subtable");
      }
      q += nbytes_subtable;
    }
  }

  //! Number of subtables.
  std::size_t nsubtables;

  //! Subtables.
  //!
  //! It might be better to name this member 'messages.' Elsewhere we use
  //! 'subtable' to refer to the fields of the messages containing the
  //! supertable elements. Using that vocabulary, a `pb::FrequencySubtable`
  //! would be a message while its `frequencies` field would be the subtable.
  std::vector<Message> subtables;

  //! Segments for a concatenated subtable chain.
  //!
  //! A `Chain<std::vector<Segment>::iterator>` can be constructed from this.
  std::vector<Segment> segments;

  //! Sizes in bytes of the subtables.
  SubtableSizes &nbytes_subtables;
};

//! A logical frequency table split into one or more subtables of moderate size.
struct FrequencySupertable
    : Supertable<pb::FrequencySubtable, Frequencies::iterator> {
  //! Constructor.
  //!
  //! Build a `FrequencySupertable` holding every nonzero entry of a vector of
  //! symbol frequencies, keyed by its index in that vector. The serialized
  //! sizes of the subtables are calculated before returning.
  //!
  //!\param frequencies Symbol frequencies to store in the subtables.
  //!\param nbytes_subtables Sizes in bytes of the subtables (field in
  //! `pb::HuffmanHeader`). This field will be written to.
  FrequencySupertable(const std::vector<std::size_t> &frequencies,
                      SubtableSizes &nbytes_subtables)
      : Supertable(std::count_if(frequencies.begin(), frequencies.end(),
                                 [](const std::size_t count) -> bool {
                                   return count != 0;
                                 }),
                   nbytes_subtables) {
    // `index` walks the input vector once across all subtables. For each
    // subtable we keep consuming input until the subtable has received the
    // number of (nonzero) entries its segment calls for.
    std::size_t index = 0;
    for (std::size_t t = 0; t < nsubtables; ++t) {
      Frequencies &table = *subtables.at(t).mutable_frequencies();
      Segment &segment = segments.at(t);
      // Entries still to be inserted into this subtable's frequency map.
      std::size_t remaining = segment.second;
      while (remaining) {
        const std::size_t frequency = frequencies.at(index);
        if (frequency) {
          table.insert({index, frequency});
          --remaining;
        }
        ++index;
      }
      // Only take the iterator once all insertions are done.
      segment.first = table.begin();
    }

    calculate_nbytes_subtables();
  }

  //! Constructor.
  //!
  //! Construct a `FrequencySupertable` from a collection of parsed messages.
  //! Each segment is pointed at the beginning of the corresponding message's
  //! frequency map and given that map's size.
  //!
  //!\param nbytes_subtables Sizes in bytes of the subtables (field in
  //! `pb::HuffmanHeader`).
  //!\param window Window into buffer containing messages to be parsed.
  FrequencySupertable(SubtableSizes &nbytes_subtables, BufferWindow &window)
      : Supertable(nbytes_subtables, window) {
    // `subtables` and `segments` have the same length; walk them in lockstep.
    auto segment = segments.begin();
    for (pb::FrequencySubtable &message : subtables) {
      Frequencies &table = *message.mutable_frequencies();
      segment->first = table.begin();
      segment->second = table.size();
      ++segment;
    }
  }
};

//! A logical 'missed' table split into one or more subtables of moderate size.
struct MissedSupertable : Supertable<pb::MissedSubtable, Missed::iterator> {
  //! Constructor.
  //!
  //! Construct an 'empty' `MissedSupertable`: each subtable's `missed` field
  //! is resized (zero-filled) to the size recorded in its segment. It is
  //! expected that the caller will subsequently write to the subtables using
  //! `Chain`.
  //!
  //!\param nmissed Number of missed symbols.
  //!\param nbytes_subtables Sizes in bytes of the subtables (field in
  //! `pb::HuffmanHeader`). This field will be written to.
  MissedSupertable(const std::size_t nmissed, SubtableSizes &nbytes_subtables)
      : Supertable(nmissed, nbytes_subtables) {
    // `subtables` and `segments` have the same length; walk them in lockstep.
    auto segment = segments.begin();
    for (pb::MissedSubtable &message : subtables) {
      Missed &entries = *message.mutable_missed();
      // Number of entries this subtable is to hold.
      const std::size_t target_size = segment->second;

      entries.Resize(target_size, 0);
      // Take the iterator only after resizing.
      segment->first = entries.begin();
      ++segment;
    }
  }

  //! Constructor.
  //!
  //! Construct a `MissedSupertable` from a collection of parsed messages. Each
  //! segment is pointed at the beginning of the corresponding message's
  //! `missed` field and given that field's size.
  //!
  //!\param nbytes_subtables Sizes in bytes of the subtables (field in
  //! `pb::HuffmanHeader`).
  //!\param window Window into buffer containing messages to be parsed.
  MissedSupertable(SubtableSizes &nbytes_subtables, BufferWindow &window)
      : Supertable(nbytes_subtables, window) {
    auto segment = segments.begin();
    for (pb::MissedSubtable &message : subtables) {
      Missed &entries = *message.mutable_missed();
      segment->first = entries.begin();
      segment->second = entries.size();
      ++segment;
    }
  }
};

} // namespace

template <typename Symbol>
MemoryBuffer<unsigned char> huffman_encode(Symbol const *const begin,
const std::size_t n) {
Expand All @@ -188,7 +404,7 @@ MemoryBuffer<unsigned char> huffman_encode(Symbol const *const begin,
const std::size_t nbits =
std::inner_product(code.frequencies.begin(), code.frequencies.end(),
lengths.begin(), static_cast<std::size_t>(0));
const std::size_t nbytes = (nbits + CHAR_BIT - 1) / CHAR_BIT;
const std::size_t nbytes_hit = (nbits + CHAR_BIT - 1) / CHAR_BIT;

pb::HuffmanHeader header;
header.set_index_mapping(pb::HuffmanHeader::INCLUSIVE_RANGE);
Expand All @@ -200,23 +416,18 @@ MemoryBuffer<unsigned char> huffman_encode(Symbol const *const begin,
header.add_endpoints(code.endpoints.second);
header.set_nbits(nbits);

Frequencies &frequencies = *header.mutable_frequencies();
{
std::size_t i = 0;
for (const std::size_t frequency : code.frequencies) {
if (frequency) {
frequencies.insert({i, frequency});
}
++i;
}
}
FrequencySupertable frequency_supertable(
code.frequencies, *header.mutable_nbytes_frequency_subtables());
MissedSupertable missed_supertable(code.nmissed(),
*header.mutable_nbytes_missed_subtables());

Missed &missed_ = *header.mutable_missed();
missed_.Resize(code.nmissed(), 0);
Missed::iterator missed = missed_.begin();
Chain<Missed::iterator> chained_missed_supertable(missed_supertable.segments);
Chain<Missed::iterator>::iterator missed = chained_missed_supertable.begin();
// Now we're ready to populate the 'missed' subtables in the course of
// populating the 'hit' buffer.

// Zero-initialize the bytes.
unsigned char *const hit_ = new unsigned char[nbytes]();
unsigned char *const hit_ = new unsigned char[nbytes_hit]();
unsigned char *hit = hit_;

unsigned char offset = 0;
Expand Down Expand Up @@ -249,8 +460,18 @@ MemoryBuffer<unsigned char> huffman_encode(Symbol const *const begin,
}
}

// We're done writing to the 'missed' subtables, so we can now calculate their
// serialized sizes. We need to do this before calling
// `missed_supertable.ByteSize`.
missed_supertable.calculate_nbytes_subtables();

const std::uint_least64_t nheader = header.ByteSize();
MemoryBuffer<unsigned char> out(HEADER_SIZE_SIZE + nheader + nbytes);
const std::size_t nbytes_frequency_supertable =
frequency_supertable.ByteSize();
const std::size_t nbytes_missed_supertable = missed_supertable.ByteSize();
MemoryBuffer<unsigned char> out(HEADER_SIZE_SIZE + nheader +
nbytes_frequency_supertable +
nbytes_missed_supertable + nbytes_hit);
{
unsigned char *p = out.data.get();
const std::array<unsigned char, HEADER_SIZE_SIZE> nheader_ =
Expand All @@ -261,8 +482,14 @@ MemoryBuffer<unsigned char> huffman_encode(Symbol const *const begin,
header.SerializeToArray(p, nheader);
p += nheader;

std::copy(hit_, hit_ + nbytes, p);
p += nbytes;
frequency_supertable.SerializeToArray(p, nbytes_frequency_supertable);
p += nbytes_frequency_supertable;

missed_supertable.SerializeToArray(p, nbytes_missed_supertable);
p += nbytes_missed_supertable;

std::copy(hit_, hit_ + nbytes_hit, p);
p += nbytes_hit;
}

delete[] hit_;
Expand All @@ -283,19 +510,24 @@ MemoryBuffer<Symbol> huffman_decode(const MemoryBuffer<unsigned char> &buffer) {
if (endpoints_.size() != 2) {
throw std::runtime_error("received an unexpected number of endpoints");
}
const std::pair<std::size_t, std::size_t> endpoints(endpoints_.Get(0),
endpoints_.Get(1));
const std::pair<Symbol, Symbol> endpoints(endpoints_.Get(0),
endpoints_.Get(1));

if (header.codeword_mapping() != pb::HuffmanHeader::INDEX_FREQUENCY_PAIRS) {
throw std::runtime_error("unrecognized Huffman codeword mapping");
}
const Frequencies &frequencies_ = header.frequencies();
FrequencySupertable frequency_supertable(
*header.mutable_nbytes_frequency_subtables(), window);
Chain<Frequencies::iterator> chained_frequency_supertable(
frequency_supertable.segments);

if (header.missed_encoding() != pb::HuffmanHeader::LITERAL) {
throw std::runtime_error("unrecognized Huffman missed buffer encoding");
}
const Missed &missed_ = header.missed();
Missed::const_iterator missed = missed_.cbegin();
MissedSupertable missed_supertable(*header.mutable_nbytes_missed_subtables(),
window);
Chain<Missed::iterator> chained_missed_supertable(missed_supertable.segments);
Chain<Missed::iterator>::iterator missed = chained_missed_supertable.begin();

if (header.hit_encoding() != pb::HuffmanHeader::RUN_TOGETHER) {
throw std::runtime_error("unrecognized Huffman hit buffer encoding");
Expand All @@ -308,8 +540,9 @@ MemoryBuffer<Symbol> huffman_decode(const MemoryBuffer<unsigned char> &buffer) {
"number of bytes in hit buffer");
}

const HuffmanCode<Symbol> code(endpoints, frequencies_.begin(),
frequencies_.end());
const HuffmanCode<Symbol> code(endpoints,
chained_frequency_supertable.begin(),
chained_frequency_supertable.end());
// TODO: Maybe add a member function for this.
const std::size_t nout =
std::accumulate(code.frequencies.begin(), code.frequencies.end(),
Expand All @@ -332,7 +565,7 @@ MemoryBuffer<Symbol> huffman_decode(const MemoryBuffer<unsigned char> &buffer) {
*q++ = decoded.first ? decoded.second : *missed++;
}
assert(nbits_read == nbits);
assert(missed == missed_.cend());
assert(missed == chained_missed_supertable.end());

return out;
}
Expand Down
16 changes: 14 additions & 2 deletions src/mgard.proto
Original file line number Diff line number Diff line change
Expand Up @@ -189,12 +189,24 @@ message HuffmanHeader {

// Minimum and maximum symbols eligible for codewords.
repeated sint64 endpoints = 5;
// Sizes in bytes of serialized `FrequencySubtable`s to follow.
repeated uint64 nbytes_frequency_subtables = 6;
// Sizes in bytes of serialized `MissedSubtable`s to follow.
repeated uint64 nbytes_missed_subtables = 7;
// Size in bits of the hit buffer to follow.
uint64 nbits = 8;
}

// One or more of these will follow a `HuffmanHeader`.
message FrequencySubtable {
// Index–frequency pairs for frequency table.
map<uint64, uint64> frequencies = 6;
}

// One or more of these will follow the `FrequencySubtable`s after a `HuffmanHeader`.
message MissedSubtable {
// Encountered symbols that were not assigned codewords.
repeated sint64 missed = 7;
// Size of the hit buffer in bits.
uint64 nbits = 8;
}

message Device {
Expand Down

0 comments on commit 005c538

Please sign in to comment.