diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 705d83f29e0c..a6199bbbcbd2 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -104,7 +104,7 @@ class BinMapper { * \brief Save binary data to file * \param file File want to write */ - void SaveBinaryToFile(const VirtualFileWriter* writer) const; + void SaveBinaryToFile(BinaryWriter* writer) const; /*! * \brief Mapping bin into feature value @@ -286,7 +286,7 @@ class Bin { * \brief Save binary data to file * \param file File want to write */ - virtual void SaveBinaryToFile(const VirtualFileWriter* writer) const = 0; + virtual void SaveBinaryToFile(BinaryWriter* writer) const = 0; /*! * \brief Load from memory diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 27fb0b620a07..bba46a02a492 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -29,6 +29,7 @@ typedef void* DatasetHandle; /*!< \brief Handle of dataset. */ typedef void* BoosterHandle; /*!< \brief Handle of booster. */ typedef void* FastConfigHandle; /*!< \brief Handle of FastConfig. */ +typedef void* ByteBufferHandle; /*!< \brief Handle of ByteBuffer. */ #define C_API_DTYPE_FLOAT32 (0) /*!< \brief float32 (single precision float). */ #define C_API_DTYPE_FLOAT64 (1) /*!< \brief float64 (double precision float). */ @@ -96,6 +97,22 @@ LIGHTGBM_C_EXPORT int LGBM_SampleIndices(int32_t num_total_row, void* out, int32_t* out_len); +/*! + * \brief Get a ByteBuffer value at an index. + * \param handle Handle of byte buffer to be read + * \param index Index of value to return + * \param[out] out_val Byte value at index to return + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_ByteBufferGetAt(ByteBufferHandle handle, int32_t index, uint8_t* out_val); + +/*! + * \brief Free space for byte buffer. 
+ * \param handle Handle of byte buffer to be freed + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_ByteBufferFree(ByteBufferHandle handle); + /* --- start Dataset interface */ /*! @@ -164,6 +181,23 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetInitStreaming(DatasetHandle dataset, int32_t nthreads, int32_t omp_max_threads); +/*! + * \brief Allocate the space for dataset and bucket feature bins according to serialized reference dataset. + * \param ref_buffer A binary representation of the dataset schema (feature groups, bins, etc.) + * \param ref_buffer_size The size of the reference array in bytes + * \param num_row Number of total rows the dataset will contain + * \param num_classes Number of classes (will be used only in case of multiclass and specifying initial scores) + * \param parameters Additional parameters + * \param[out] out Created dataset + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSerializedReference(const void* ref_buffer, + int32_t ref_buffer_size, + int64_t num_row, + int32_t num_classes, + const char* parameters, + DatasetHandle* out); + /*! * \brief Push data to existing dataset, if ``nrow + start_row == num_total_row``, will call ``dataset->FinishLoad``. * \param dataset Handle of dataset @@ -464,6 +498,17 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetFree(DatasetHandle handle); LIGHTGBM_C_EXPORT int LGBM_DatasetSaveBinary(DatasetHandle handle, const char* filename); +/*! + * \brief Create a dataset schema representation as a binary byte array (excluding data). + * \param handle Handle of dataset + * \param[out] out The output byte array + * \param[out] out_len The length of the output byte array (returned for convenience) + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_DatasetSerializeReferenceToBinary(DatasetHandle handle, + ByteBufferHandle* out, + int32_t* out_len); + /*! 
* \brief Save dataset to text file, intended for debugging use only. * \param handle Handle of dataset diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 97bc146c9a9e..79c4ed196b09 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -124,7 +125,7 @@ class Metadata { * \brief Save binary data to file * \param file File want to write */ - void SaveBinaryToFile(const VirtualFileWriter* writer) const; + void SaveBinaryToFile(BinaryWriter* writer) const; /*! * \brief Get sizes in byte of this object @@ -621,6 +622,11 @@ class Dataset { */ LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename); + /*! + * \brief Serialize the overall Dataset definition/schema to a binary buffer (i.e., without data) + */ + LIGHTGBM_EXPORT void SerializeReference(ByteBuffer* out); + LIGHTGBM_EXPORT void DumpTextFile(const char* text_filename); LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset); @@ -919,6 +925,10 @@ class Dataset { #endif // USE_CUDA private: + void SerializeHeader(BinaryWriter* serializer); + + size_t GetSerializedHeaderSize(); + void CreateCUDAColumnData(); std::string data_filename_; @@ -938,8 +948,11 @@ class Dataset { int label_idx_ = 0; /*! \brief store feature names */ std::vector feature_names_; - /*! \brief store feature names */ + /*! 
\brief serialized reference format version and binary file/reference tokens */
- * \brief Constructor from memory + * \brief Constructor from memory when data is present * \param memory Pointer of memory * \param num_all_data Number of global data * \param local_used_indices Local used indices, empty means using all data + * \param group_id Id of group */ - FeatureGroup(const void* memory, data_size_t num_all_data, + FeatureGroup(const void* memory, + data_size_t num_all_data, const std::vector& local_used_indices, int group_id) { + // Load the definition schema first + const char* memory_ptr = LoadDefinitionFromMemory(memory, group_id); + + // Allocate memory for the data + data_size_t num_data = num_all_data; + if (!local_used_indices.empty()) { + num_data = static_cast(local_used_indices.size()); + } + AllocateBins(num_data); + + // Now load the actual data + if (is_multi_val_) { + for (int i = 0; i < num_feature_; ++i) { + multi_bin_data_[i]->LoadFromMemory(memory_ptr, local_used_indices); + memory_ptr += multi_bin_data_[i]->SizesInByte(); + } + } else { + bin_data_->LoadFromMemory(memory_ptr, local_used_indices); + } + } + + /*! + * \brief Constructor from definition in memory (without data) + * \param memory Pointer of memory + * \param local_used_indices Local used indices, empty means using all data + */ + FeatureGroup(const void* memory, data_size_t num_data, int group_id) { + LoadDefinitionFromMemory(memory, group_id); + AllocateBins(num_data); + } + + /*! \brief Destructor */ + ~FeatureGroup() {} + + /*! 
+ * \brief Load the overall definition of the feature group from binary serialized data + * \param memory Pointer of memory + * \param group_id Id of group + */ + const char* LoadDefinitionFromMemory(const void* memory, int group_id) { const char* memory_ptr = reinterpret_cast(memory); // get is_sparse is_multi_val_ = *(reinterpret_cast(memory_ptr)); @@ -128,9 +170,9 @@ class FeatureGroup { memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_)); num_feature_ = *(reinterpret_cast(memory_ptr)); memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_)); - // get bin mapper - bin_mappers_.clear(); + // get bin mapper(s) + bin_mappers_.clear(); for (int i = 0; i < num_feature_; ++i) { bin_mappers_.emplace_back(new BinMapper(memory_ptr)); memory_ptr += bin_mappers_[i]->SizesInByte(); @@ -158,22 +200,23 @@ class FeatureGroup { num_total_bin_ += num_bin; bin_offsets_.emplace_back(num_total_bin_); } - data_size_t num_data = num_all_data; - if (!local_used_indices.empty()) { - num_data = static_cast(local_used_indices.size()); - } + + return memory_ptr; + } + + /*! + * \brief Allocate the bins + * \param num_all_data Number of global data + */ + inline void AllocateBins(data_size_t num_data) { if (is_multi_val_) { for (int i = 0; i < num_feature_; ++i) { int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 
0 : 1; if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) { - multi_bin_data_.emplace_back(Bin::CreateSparseBin( - num_data, bin_mappers_[i]->num_bin() + addi)); + multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi)); } else { - multi_bin_data_.emplace_back( - Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi)); + multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi)); } - multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices); - memory_ptr += multi_bin_data_.back()->SizesInByte(); } } else { if (is_sparse_) { @@ -181,14 +224,9 @@ class FeatureGroup { } else { bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_)); } - // get bin data - bin_data_->LoadFromMemory(memory_ptr, local_used_indices); } } - /*! \brief Destructor */ - ~FeatureGroup() {} - /*! * \brief Initialize for pushing in a streaming fashion. By default, no action needed. * \param num_thread The number of external threads that will be calling the push APIs @@ -414,10 +452,11 @@ class FeatureGroup { } /*! 
- * \brief Save binary data to file - * \param file File want to write + * \brief Write to binary stream + * \param writer Writer + * \param include_data Whether to write data (true) or just header information (false) */ - void SaveBinaryToFile(const VirtualFileWriter* writer) const { + void SerializeToBinary(BinaryWriter* writer, bool include_data = true) const { writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_)); writer->AlignedWrite(&is_dense_multi_val_, sizeof(is_dense_multi_val_)); writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_)); @@ -425,19 +464,22 @@ class FeatureGroup { for (int i = 0; i < num_feature_; ++i) { bin_mappers_[i]->SaveBinaryToFile(writer); } - if (is_multi_val_) { - for (int i = 0; i < num_feature_; ++i) { - multi_bin_data_[i]->SaveBinaryToFile(writer); + + if (include_data) { + if (is_multi_val_) { + for (int i = 0; i < num_feature_; ++i) { + multi_bin_data_[i]->SaveBinaryToFile(writer); + } + } else { + bin_data_->SaveBinaryToFile(writer); } - } else { - bin_data_->SaveBinaryToFile(writer); } } /*! 
* \brief Get sizes in byte of this object */ - size_t SizesInByte() const { + size_t SizesInByte(bool include_data = true) const { size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) + VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_)) + VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) + @@ -445,11 +487,13 @@ class FeatureGroup { for (int i = 0; i < num_feature_; ++i) { ret += bin_mappers_[i]->SizesInByte(); } - if (!is_multi_val_) { - ret += bin_data_->SizesInByte(); - } else { - for (int i = 0; i < num_feature_; ++i) { - ret += multi_bin_data_[i]->SizesInByte(); + if (include_data) { + if (!is_multi_val_) { + ret += bin_data_->SizesInByte(); + } else { + for (int i = 0; i < num_feature_; ++i) { + ret += multi_bin_data_[i]->SizesInByte(); + } } } return ret; diff --git a/include/LightGBM/utils/binary_writer.h b/include/LightGBM/utils/binary_writer.h new file mode 100644 index 000000000000..5b7604b52807 --- /dev/null +++ b/include/LightGBM/utils/binary_writer.h @@ -0,0 +1,58 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ +#ifndef LIGHTGBM_UTILS_BINARY_WRITER_H_ +#define LIGHTGBM_UTILS_BINARY_WRITER_H_ + +#include +#include + +namespace LightGBM { + +/*! + * \brief An interface for serializing binary data to a buffer + */ +struct BinaryWriter { + /*! + * \brief Append data to this binary target + * \param data Buffer to write from + * \param bytes Number of bytes to write from buffer + * \return Number of bytes written + */ + virtual size_t Write(const void* data, size_t bytes) = 0; + + /*! 
+ * \brief Append data to this binary target aligned on a given byte size boundary + * \param data Buffer to write from + * \param bytes Number of bytes to write from buffer + * \param alignment The size of bytes to align to in whole increments + * \return Number of bytes written + */ + size_t AlignedWrite(const void* data, size_t bytes, size_t alignment = 8) { + auto ret = Write(data, bytes); + if (bytes % alignment != 0) { + size_t padding = AlignedSize(bytes, alignment) - bytes; + std::vector tmp(padding, 0); + ret += Write(tmp.data(), padding); + } + return ret; + } + + /*! + * \brief The aligned size of a buffer length. + * \param bytes The number of bytes in a buffer + * \param alignment The size of bytes to align to in whole increments + * \return Number of aligned bytes + */ + static size_t AlignedSize(size_t bytes, size_t alignment = 8) { + if (bytes % alignment == 0) { + return bytes; + } else { + return bytes / alignment * alignment + alignment; + } + } +}; +} // namespace LightGBM + +#endif // LIGHTGBM_UTILS_BINARY_WRITER_H_ diff --git a/include/LightGBM/utils/byte_buffer.h b/include/LightGBM/utils/byte_buffer.h new file mode 100644 index 000000000000..86497dbc0969 --- /dev/null +++ b/include/LightGBM/utils/byte_buffer.h @@ -0,0 +1,62 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ +#ifndef LIGHTGBM_UTILS_BYTE_BUFFER_H_ +#define LIGHTGBM_UTILS_BYTE_BUFFER_H_ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace LightGBM { + +/*! 
+ * \brief An implementation for serializing binary data to an auto-expanding memory buffer + */ +struct ByteBuffer final : public BinaryWriter { + ByteBuffer() {} + + explicit ByteBuffer(size_t initial_size) { + buffer_.reserve(initial_size); + } + + size_t Write(const void* data, size_t bytes) { + const char* mem_ptr = static_cast(data); + for (size_t i = 0; i < bytes; ++i) { + buffer_.push_back(mem_ptr[i]); + } + + return bytes; + } + + LIGHTGBM_EXPORT void Reserve(size_t capacity) { + buffer_.reserve(capacity); + } + + LIGHTGBM_EXPORT size_t GetSize() { + return buffer_.size(); + } + + LIGHTGBM_EXPORT char GetAt(size_t index) { + return buffer_.at(index); + } + + LIGHTGBM_EXPORT char* Data() { + return buffer_.data(); + } + + private: + std::vector buffer_; +}; + +} // namespace LightGBM + +#endif // LightGBM_UTILS_BYTE_BUFFER_H_ diff --git a/include/LightGBM/utils/file_io.h b/include/LightGBM/utils/file_io.h index 62ec3dbdc326..f59dda590bc1 100644 --- a/include/LightGBM/utils/file_io.h +++ b/include/LightGBM/utils/file_io.h @@ -5,6 +5,8 @@ #ifndef LIGHTGBM_UTILS_FILE_IO_H_ #define LIGHTGBM_UTILS_FILE_IO_H_ +#include + #include #include #include @@ -18,50 +20,28 @@ namespace LightGBM { /*! * \brief An interface for writing files from buffers */ -struct VirtualFileWriter { +struct VirtualFileWriter : BinaryWriter { virtual ~VirtualFileWriter() {} + /*! * \brief Initialize the writer * \return True when the file is available for writes */ virtual bool Init() = 0; - /*! 
- * \brief Append buffer to file - * \param data Buffer to write from - * \param bytes Number of bytes to write from buffer - * \return Number of bytes written - */ - virtual size_t Write(const void* data, size_t bytes) const = 0; - size_t AlignedWrite(const void* data, size_t bytes, size_t alignment = 8) const { - auto ret = Write(data, bytes); - if (bytes % alignment != 0) { - size_t padding = AlignedSize(bytes, alignment) - bytes; - std::vector tmp(padding, 0); - ret += Write(tmp.data(), padding); - } - return ret; - } /*! * \brief Create appropriate writer for filename * \param filename Filename of the data * \return File writer instance */ static std::unique_ptr Make(const std::string& filename); + /*! * \brief Check filename existence * \param filename Filename of the data * \return True when the file exists */ static bool Exists(const std::string& filename); - - static size_t AlignedSize(size_t bytes, size_t alignment = 8) { - if (bytes % alignment == 0) { - return bytes; - } else { - return bytes / alignment * alignment + alignment; - } - } }; /** diff --git a/src/c_api.cpp b/src/c_api.cpp index 004a1f230c74..442247d7a9dd 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -951,6 +952,19 @@ int LGBM_SampleIndices(int32_t num_total_row, API_END(); } +int LGBM_ByteBufferGetAt(ByteBufferHandle handle, int32_t index, uint8_t* out_val) { + API_BEGIN(); + LightGBM::ByteBuffer* byteBuffer = reinterpret_cast(handle); + *out_val = byteBuffer->GetAt(index); + API_END(); +} + +int LGBM_ByteBufferFree(ByteBufferHandle handle) { + API_BEGIN(); + delete reinterpret_cast(handle); + API_END(); +} + int LGBM_DatasetCreateFromFile(const char* filename, const char* parameters, const DatasetHandle reference, @@ -1013,6 +1027,25 @@ int LGBM_DatasetCreateByReference(const DatasetHandle reference, API_END(); } +int LGBM_DatasetCreateFromSerializedReference(const void* ref_buffer, + int32_t 
ref_buffer_size, + int64_t num_row, + int32_t num_classes, + const char* parameters, + DatasetHandle* out) { + API_BEGIN(); + auto param = Config::Str2Map(parameters); + Config config; + config.Set(param); + OMP_SET_NUM_THREADS(config.num_threads); + DatasetLoader loader(config, nullptr, 1, nullptr); + *out = loader.LoadFromSerializedReference(static_cast(ref_buffer), + static_cast(ref_buffer_size), + static_cast(num_row), + num_classes); + API_END(); +} + int LGBM_DatasetInitStreaming(DatasetHandle dataset, int32_t has_weights, int32_t has_init_scores, @@ -1613,6 +1646,19 @@ int LGBM_DatasetSaveBinary(DatasetHandle handle, API_END(); } +int LGBM_DatasetSerializeReferenceToBinary(DatasetHandle handle, + ByteBufferHandle* out, + int32_t* out_len) { + API_BEGIN(); + auto dataset = reinterpret_cast(handle); + std::unique_ptr ret; + ret.reset(new LightGBM::ByteBuffer()); + dataset->SerializeReference(ret.get()); + *out_len = static_cast(ret->GetSize()); + *out = ret.release(); + API_END(); +} + int LGBM_DatasetDumpText(DatasetHandle handle, const char* filename) { API_BEGIN(); diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 652b874c94d8..30da15d81053 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -577,7 +577,7 @@ namespace LightGBM { } } - void BinMapper::SaveBinaryToFile(const VirtualFileWriter* writer) const { + void BinMapper::SaveBinaryToFile(BinaryWriter* writer) const { writer->AlignedWrite(&num_bin_, sizeof(num_bin_)); writer->AlignedWrite(&missing_type_, sizeof(missing_type_)); writer->AlignedWrite(&is_trivial_, sizeof(is_trivial_)); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index de368d3036db..a8f449d3f55b 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -19,8 +19,13 @@ namespace LightGBM { +const int Dataset::kSerializedReferenceVersionLength = 2; +const char* Dataset::serialized_reference_version = "v1"; + const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n"; +const char* 
Dataset::binary_serialized_reference_token = + "______LightGBM_Binary_Serialized_Token______\n"; Dataset::Dataset() { data_filename_ = "noname"; @@ -994,80 +999,9 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { Log::Info("Saving data to binary file %s", bin_filename); size_t size_of_token = std::strlen(binary_file_token); writer->AlignedWrite(binary_file_token, size_of_token); - // get size of header - size_t size_of_header = - VirtualFileWriter::AlignedSize(sizeof(num_data_)) + - VirtualFileWriter::AlignedSize(sizeof(num_features_)) + - VirtualFileWriter::AlignedSize(sizeof(num_total_features_)) + - VirtualFileWriter::AlignedSize(sizeof(int) * num_total_features_) + - VirtualFileWriter::AlignedSize(sizeof(label_idx_)) + - VirtualFileWriter::AlignedSize(sizeof(num_groups_)) + - 3 * VirtualFileWriter::AlignedSize(sizeof(int) * num_features_) + - sizeof(uint64_t) * (num_groups_ + 1) + - 2 * VirtualFileWriter::AlignedSize(sizeof(int) * num_groups_) + - VirtualFileWriter::AlignedSize(sizeof(int32_t) * num_total_features_) + - VirtualFileWriter::AlignedSize(sizeof(int)) * 3 + - VirtualFileWriter::AlignedSize(sizeof(bool)) * 3; - // size of feature names - for (int i = 0; i < num_total_features_; ++i) { - size_of_header += - VirtualFileWriter::AlignedSize(feature_names_[i].size()) + - VirtualFileWriter::AlignedSize(sizeof(int)); - } - // size of forced bins - for (int i = 0; i < num_total_features_; ++i) { - size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + - VirtualFileWriter::AlignedSize(sizeof(int)); - } - writer->Write(&size_of_header, sizeof(size_of_header)); - // write header - writer->AlignedWrite(&num_data_, sizeof(num_data_)); - writer->AlignedWrite(&num_features_, sizeof(num_features_)); - writer->AlignedWrite(&num_total_features_, sizeof(num_total_features_)); - writer->AlignedWrite(&label_idx_, sizeof(label_idx_)); - writer->AlignedWrite(&max_bin_, sizeof(max_bin_)); - writer->AlignedWrite(&bin_construct_sample_cnt_, - 
sizeof(bin_construct_sample_cnt_)); - writer->AlignedWrite(&min_data_in_bin_, sizeof(min_data_in_bin_)); - writer->AlignedWrite(&use_missing_, sizeof(use_missing_)); - writer->AlignedWrite(&zero_as_missing_, sizeof(zero_as_missing_)); - writer->AlignedWrite(&has_raw_, sizeof(has_raw_)); - writer->AlignedWrite(used_feature_map_.data(), - sizeof(int) * num_total_features_); - writer->AlignedWrite(&num_groups_, sizeof(num_groups_)); - writer->AlignedWrite(real_feature_idx_.data(), sizeof(int) * num_features_); - writer->AlignedWrite(feature2group_.data(), sizeof(int) * num_features_); - writer->AlignedWrite(feature2subfeature_.data(), - sizeof(int) * num_features_); - writer->Write(group_bin_boundaries_.data(), - sizeof(uint64_t) * (num_groups_ + 1)); - writer->AlignedWrite(group_feature_start_.data(), - sizeof(int) * num_groups_); - writer->AlignedWrite(group_feature_cnt_.data(), sizeof(int) * num_groups_); - if (max_bin_by_feature_.empty()) { - ArrayArgs::Assign(&max_bin_by_feature_, -1, num_total_features_); - } - writer->AlignedWrite(max_bin_by_feature_.data(), - sizeof(int32_t) * num_total_features_); - if (ArrayArgs::CheckAll(max_bin_by_feature_, -1)) { - max_bin_by_feature_.clear(); - } - // write feature names - for (int i = 0; i < num_total_features_; ++i) { - int str_len = static_cast(feature_names_[i].size()); - writer->AlignedWrite(&str_len, sizeof(int)); - const char* c_str = feature_names_[i].c_str(); - writer->AlignedWrite(c_str, sizeof(char) * str_len); - } - // write forced bins - for (int i = 0; i < num_total_features_; ++i) { - int num_bounds = static_cast(forced_bin_bounds_[i].size()); - writer->AlignedWrite(&num_bounds, sizeof(int)); - for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) { - writer->Write(&forced_bin_bounds_[i][j], sizeof(double)); - } - } + // Write the basic header information for the dataset + SerializeHeader(writer.get()); // get size of meta data size_t size_of_metadata = metadata_.SizesInByte(); @@ -1081,7 +1015,7 @@ 
void Dataset::SaveBinaryFile(const char* bin_filename) { size_t size_of_feature = feature_groups_[i]->SizesInByte(); writer->Write(&size_of_feature, sizeof(size_of_feature)); // write feature - feature_groups_[i]->SaveBinaryToFile(writer.get()); + feature_groups_[i]->SerializeToBinary(writer.get()); } // write raw data; use row-major order so we can read row-by-row @@ -1098,6 +1032,117 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { } } +void Dataset::SerializeReference(ByteBuffer* buffer) { + Log::Info("Saving data reference to binary buffer"); + + // Calculate approximate size of output and reserve space + size_t size_of_token = std::strlen(binary_serialized_reference_token); + size_t initial_capacity = size_of_token + GetSerializedHeaderSize(); + // write feature group definitions + for (int i = 0; i < num_groups_; ++i) { + initial_capacity += feature_groups_[i]->SizesInByte(/* include_data */ false); + } + + // Give a little extra just in case, to avoid unnecessary resizes + buffer->Reserve(static_cast(1.1 * static_cast(initial_capacity))); + + // Write token that marks the data as binary reference, and the version + buffer->AlignedWrite(binary_serialized_reference_token, size_of_token); + buffer->AlignedWrite(serialized_reference_version, kSerializedReferenceVersionLength); + + // Write the basic definition of the overall dataset + SerializeHeader(buffer); + + // write feature group definitions + for (int i = 0; i < num_groups_; ++i) { + // get size of feature + size_t size_of_feature = feature_groups_[i]->SizesInByte(false); + buffer->Write(&size_of_feature, sizeof(size_of_feature)); + // write feature + feature_groups_[i]->SerializeToBinary(buffer, /* include_data */ false); + } +} + +size_t Dataset::GetSerializedHeaderSize() { + size_t size_of_header = + VirtualFileWriter::AlignedSize(sizeof(num_data_)) + + VirtualFileWriter::AlignedSize(sizeof(num_features_)) + + VirtualFileWriter::AlignedSize(sizeof(num_total_features_)) + + 
VirtualFileWriter::AlignedSize(sizeof(int) * num_total_features_) + + VirtualFileWriter::AlignedSize(sizeof(label_idx_)) + + VirtualFileWriter::AlignedSize(sizeof(num_groups_)) + + 3 * VirtualFileWriter::AlignedSize(sizeof(int) * num_features_) + + sizeof(uint64_t) * (num_groups_ + 1) + + 2 * VirtualFileWriter::AlignedSize(sizeof(int) * num_groups_) + + VirtualFileWriter::AlignedSize(sizeof(int32_t) * num_total_features_) + + VirtualFileWriter::AlignedSize(sizeof(int)) * 3 + + VirtualFileWriter::AlignedSize(sizeof(bool)) * 3; + // size of feature names and forced bins + for (int i = 0; i < num_total_features_; ++i) { + size_of_header += + VirtualFileWriter::AlignedSize(feature_names_[i].size()) + + VirtualFileWriter::AlignedSize(sizeof(int)) + + forced_bin_bounds_[i].size() * sizeof(double) + + VirtualFileWriter::AlignedSize(sizeof(int)); + } + + return size_of_header; +} + +void Dataset::SerializeHeader(BinaryWriter* writer) { + size_t size_of_header = GetSerializedHeaderSize(); + writer->Write(&size_of_header, sizeof(size_of_header)); + + // write header + writer->AlignedWrite(&num_data_, sizeof(num_data_)); + writer->AlignedWrite(&num_features_, sizeof(num_features_)); + writer->AlignedWrite(&num_total_features_, sizeof(num_total_features_)); + writer->AlignedWrite(&label_idx_, sizeof(label_idx_)); + writer->AlignedWrite(&max_bin_, sizeof(max_bin_)); + writer->AlignedWrite(&bin_construct_sample_cnt_, + sizeof(bin_construct_sample_cnt_)); + writer->AlignedWrite(&min_data_in_bin_, sizeof(min_data_in_bin_)); + writer->AlignedWrite(&use_missing_, sizeof(use_missing_)); + writer->AlignedWrite(&zero_as_missing_, sizeof(zero_as_missing_)); + writer->AlignedWrite(&has_raw_, sizeof(has_raw_)); + writer->AlignedWrite(used_feature_map_.data(), + sizeof(int) * num_total_features_); + writer->AlignedWrite(&num_groups_, sizeof(num_groups_)); + writer->AlignedWrite(real_feature_idx_.data(), sizeof(int) * num_features_); + writer->AlignedWrite(feature2group_.data(), sizeof(int) 
* num_features_); + writer->AlignedWrite(feature2subfeature_.data(), + sizeof(int) * num_features_); + writer->Write(group_bin_boundaries_.data(), + sizeof(uint64_t) * (num_groups_ + 1)); + writer->AlignedWrite(group_feature_start_.data(), + sizeof(int) * num_groups_); + writer->AlignedWrite(group_feature_cnt_.data(), sizeof(int) * num_groups_); + if (max_bin_by_feature_.empty()) { + ArrayArgs::Assign(&max_bin_by_feature_, -1, num_total_features_); + } + writer->AlignedWrite(max_bin_by_feature_.data(), + sizeof(int32_t) * num_total_features_); + if (ArrayArgs::CheckAll(max_bin_by_feature_, -1)) { + max_bin_by_feature_.clear(); + } + // write feature names + for (int i = 0; i < num_total_features_; ++i) { + int str_len = static_cast(feature_names_[i].size()); + writer->AlignedWrite(&str_len, sizeof(int)); + const char* c_str = feature_names_[i].c_str(); + writer->AlignedWrite(c_str, sizeof(char) * str_len); + } + // write forced bins + for (int i = 0; i < num_total_features_; ++i) { + int num_bounds = static_cast(forced_bin_bounds_[i].size()); + writer->AlignedWrite(&num_bounds, sizeof(int)); + + for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) { + writer->Write(&forced_bin_bounds_[i][j], sizeof(double)); + } + } +} + void Dataset::DumpTextFile(const char* text_filename) { FILE* file = NULL; #if _MSC_VER diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 621176db2c59..c0aa5e9d4eaf 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -353,6 +353,67 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, return dataset.release(); } +Dataset* DatasetLoader::LoadFromSerializedReference(const char* binary_data, size_t buffer_size, data_size_t num_data, int32_t num_classes) { + auto dataset = std::unique_ptr(new Dataset(num_data)); + + auto mem_ptr = binary_data; + + // check token + const size_t size_of_token = std::strlen(Dataset::binary_serialized_reference_token); + size_t 
size_of_token_in_input = VirtualFileWriter::AlignedSize(sizeof(char) * size_of_token); + if (buffer_size < size_of_token_in_input) { + Log::Fatal("Binary definition file error: token has the wrong size"); + } + if (std::string(mem_ptr, size_of_token) != std::string(Dataset::binary_serialized_reference_token)) { + Log::Fatal("Input file is not LightGBM binary reference file"); + } + mem_ptr += size_of_token_in_input; + + size_t size_of_version = VirtualFileWriter::AlignedSize(Dataset::kSerializedReferenceVersionLength); + std::string version(mem_ptr, Dataset::kSerializedReferenceVersionLength); + if (version != std::string(Dataset::serialized_reference_version)) { + Log::Fatal("Unexpected version of serialized binary data: %s", version.c_str()); + } + mem_ptr += size_of_version; + + size_t size_of_header = *(reinterpret_cast(mem_ptr)); + mem_ptr += sizeof(size_t); + + LoadHeaderFromMemory(dataset.get(), mem_ptr); + dataset->num_data_ = num_data; // update to the given num_data + mem_ptr += size_of_header; + + // read feature group definitions + for (int i = 0; i < dataset->num_groups_; ++i) { + // read feature size + const size_t size_of_feature = *(reinterpret_cast(mem_ptr)); + mem_ptr += sizeof(size_t); + dataset->feature_groups_.emplace_back(std::unique_ptr(new FeatureGroup(mem_ptr, num_data, i))); + mem_ptr += size_of_feature; + } + dataset->feature_groups_.shrink_to_fit(); + + dataset->numeric_feature_map_ = std::vector(dataset->num_features_, false); + dataset->num_numeric_features_ = 0; + for (int i = 0; i < dataset->num_features_; ++i) { + if (dataset->FeatureBinMapper(i)->bin_type() == BinType::CategoricalBin) { + dataset->numeric_feature_map_[i] = -1; + } else { + dataset->numeric_feature_map_[i] = dataset->num_numeric_features_; + ++dataset->num_numeric_features_; + } + } + + int has_weights = config_.weight_column.size() > 0; + int has_init_scores = num_classes > 0; + int has_queries = config_.group_column.size() > 0; + dataset->metadata_.Init(num_data, 
has_weights, has_init_scores, has_queries, num_classes); + + Log::Info("Loaded reference dataset: %d features, %d num_data", dataset->num_features_, num_data); + + return dataset.release(); +} + Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector* used_data_indices) { @@ -388,7 +449,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b size_t size_of_head = *(reinterpret_cast(buffer.data())); - // re-allocmate space if not enough + // re-allocate space if not enough if (size_of_head > buffer_size) { buffer_size = size_of_head; buffer.resize(buffer_size); @@ -401,135 +462,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b } // get header const char* mem_ptr = buffer.data(); - dataset->num_data_ = *(reinterpret_cast(mem_ptr)); - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_data_)); - dataset->num_features_ = *(reinterpret_cast(mem_ptr)); - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_features_)); - dataset->num_total_features_ = *(reinterpret_cast(mem_ptr)); - mem_ptr += - VirtualFileWriter::AlignedSize(sizeof(dataset->num_total_features_)); - dataset->label_idx_ = *(reinterpret_cast(mem_ptr)); - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->label_idx_)); - dataset->max_bin_ = *(reinterpret_cast(mem_ptr)); - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->max_bin_)); - dataset->bin_construct_sample_cnt_ = *(reinterpret_cast(mem_ptr)); - mem_ptr += VirtualFileWriter::AlignedSize( - sizeof(dataset->bin_construct_sample_cnt_)); - dataset->min_data_in_bin_ = *(reinterpret_cast(mem_ptr)); - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->min_data_in_bin_)); - dataset->use_missing_ = *(reinterpret_cast(mem_ptr)); - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->use_missing_)); - dataset->zero_as_missing_ = 
*(reinterpret_cast(mem_ptr)); - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->zero_as_missing_)); - dataset->has_raw_ = *(reinterpret_cast(mem_ptr)); - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->has_raw_)); - const int* tmp_feature_map = reinterpret_cast(mem_ptr); - dataset->used_feature_map_.clear(); - for (int i = 0; i < dataset->num_total_features_; ++i) { - dataset->used_feature_map_.push_back(tmp_feature_map[i]); - } - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * - dataset->num_total_features_); - // num_groups - dataset->num_groups_ = *(reinterpret_cast(mem_ptr)); - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_groups_)); - // real_feature_idx_ - const int* tmp_ptr_real_feature_idx_ = reinterpret_cast(mem_ptr); - dataset->real_feature_idx_.clear(); - for (int i = 0; i < dataset->num_features_; ++i) { - dataset->real_feature_idx_.push_back(tmp_ptr_real_feature_idx_[i]); - } - mem_ptr += - VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_); - // feature2group - const int* tmp_ptr_feature2group = reinterpret_cast(mem_ptr); - dataset->feature2group_.clear(); - for (int i = 0; i < dataset->num_features_; ++i) { - dataset->feature2group_.push_back(tmp_ptr_feature2group[i]); - } - mem_ptr += - VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_); - // feature2subfeature - const int* tmp_ptr_feature2subfeature = reinterpret_cast(mem_ptr); - dataset->feature2subfeature_.clear(); - for (int i = 0; i < dataset->num_features_; ++i) { - dataset->feature2subfeature_.push_back(tmp_ptr_feature2subfeature[i]); - } - mem_ptr += - VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_); - // group_bin_boundaries - const uint64_t* tmp_ptr_group_bin_boundaries = reinterpret_cast(mem_ptr); - dataset->group_bin_boundaries_.clear(); - for (int i = 0; i < dataset->num_groups_ + 1; ++i) { - dataset->group_bin_boundaries_.push_back(tmp_ptr_group_bin_boundaries[i]); - } - mem_ptr += 
sizeof(uint64_t) * (dataset->num_groups_ + 1); - - // group_feature_start_ - const int* tmp_ptr_group_feature_start = reinterpret_cast(mem_ptr); - dataset->group_feature_start_.clear(); - for (int i = 0; i < dataset->num_groups_; ++i) { - dataset->group_feature_start_.push_back(tmp_ptr_group_feature_start[i]); - } - mem_ptr += - VirtualFileWriter::AlignedSize(sizeof(int) * (dataset->num_groups_)); - - // group_feature_cnt_ - const int* tmp_ptr_group_feature_cnt = reinterpret_cast(mem_ptr); - dataset->group_feature_cnt_.clear(); - for (int i = 0; i < dataset->num_groups_; ++i) { - dataset->group_feature_cnt_.push_back(tmp_ptr_group_feature_cnt[i]); - } - mem_ptr += - VirtualFileWriter::AlignedSize(sizeof(int) * (dataset->num_groups_)); - - if (!config_.max_bin_by_feature.empty()) { - CHECK_EQ(static_cast(dataset->num_total_features_), config_.max_bin_by_feature.size()); - CHECK_GT(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())), 1); - dataset->max_bin_by_feature_.resize(dataset->num_total_features_); - dataset->max_bin_by_feature_.assign(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end()); - } else { - const int32_t* tmp_ptr_max_bin_by_feature = reinterpret_cast(mem_ptr); - dataset->max_bin_by_feature_.clear(); - for (int i = 0; i < dataset->num_total_features_; ++i) { - dataset->max_bin_by_feature_.push_back(tmp_ptr_max_bin_by_feature[i]); - } - } - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int32_t) * - (dataset->num_total_features_)); - if (ArrayArgs::CheckAll(dataset->max_bin_by_feature_, -1)) { - dataset->max_bin_by_feature_.clear(); - } - - // get feature names - dataset->feature_names_.clear(); - // write feature names - for (int i = 0; i < dataset->num_total_features_; ++i) { - int str_len = *(reinterpret_cast(mem_ptr)); - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int)); - std::stringstream str_buf; - auto tmp_arr = reinterpret_cast(mem_ptr); - for (int j = 0; j < str_len; ++j) { - char 
tmp_char = tmp_arr[j]; - str_buf << tmp_char; - } - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(char) * str_len); - dataset->feature_names_.emplace_back(str_buf.str()); - } - // get forced_bin_bounds_ - dataset->forced_bin_bounds_ = std::vector>(dataset->num_total_features_, std::vector()); - for (int i = 0; i < dataset->num_total_features_; ++i) { - int num_bounds = *(reinterpret_cast(mem_ptr)); - mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int)); - dataset->forced_bin_bounds_[i] = std::vector(); - const double* tmp_ptr_forced_bounds = - reinterpret_cast(mem_ptr); - for (int j = 0; j < num_bounds; ++j) { - double bound = tmp_ptr_forced_bounds[j]; - dataset->forced_bin_bounds_[i].push_back(bound); - } - mem_ptr += num_bounds * sizeof(double); - } + LoadHeaderFromMemory(dataset.get(), mem_ptr); // read size of meta data read_cnt = reader->Read(buffer.data(), sizeof(size_t)); @@ -821,6 +754,131 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values, // ---- private functions ---- +void DatasetLoader::LoadHeaderFromMemory(Dataset* dataset, const char* buffer) { + // get header + const char* mem_ptr = buffer; + dataset->num_data_ = *(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_data_)); + dataset->num_features_ = *(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_features_)); + dataset->num_total_features_ = *(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_total_features_)); + dataset->label_idx_ = *(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->label_idx_)); + dataset->max_bin_ = *(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->max_bin_)); + dataset->bin_construct_sample_cnt_ = *(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->bin_construct_sample_cnt_)); + dataset->min_data_in_bin_ = 
*(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->min_data_in_bin_)); + dataset->use_missing_ = *(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->use_missing_)); + dataset->zero_as_missing_ = *(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->zero_as_missing_)); + dataset->has_raw_ = *(reinterpret_cast(mem_ptr)); + + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->has_raw_)); + const int* tmp_feature_map = reinterpret_cast(mem_ptr); + dataset->used_feature_map_.clear(); + for (int i = 0; i < dataset->num_total_features_; ++i) { + dataset->used_feature_map_.push_back(tmp_feature_map[i]); + } + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_total_features_); + // num_groups + dataset->num_groups_ = *(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_groups_)); + // real_feature_idx_ + const int* tmp_ptr_real_feature_idx_ = reinterpret_cast(mem_ptr); + dataset->real_feature_idx_.clear(); + for (int i = 0; i < dataset->num_features_; ++i) { + dataset->real_feature_idx_.push_back(tmp_ptr_real_feature_idx_[i]); + } + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_); + // feature2group + const int* tmp_ptr_feature2group = reinterpret_cast(mem_ptr); + dataset->feature2group_.clear(); + for (int i = 0; i < dataset->num_features_; ++i) { + dataset->feature2group_.push_back(tmp_ptr_feature2group[i]); + } + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_); + // feature2subfeature + const int* tmp_ptr_feature2subfeature = reinterpret_cast(mem_ptr); + dataset->feature2subfeature_.clear(); + for (int i = 0; i < dataset->num_features_; ++i) { + dataset->feature2subfeature_.push_back(tmp_ptr_feature2subfeature[i]); + } + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_); + // group_bin_boundaries + 
const uint64_t* tmp_ptr_group_bin_boundaries = reinterpret_cast(mem_ptr); + dataset->group_bin_boundaries_.clear(); + for (int i = 0; i < dataset->num_groups_ + 1; ++i) { + dataset->group_bin_boundaries_.push_back(tmp_ptr_group_bin_boundaries[i]); + } + mem_ptr += sizeof(uint64_t) * (dataset->num_groups_ + 1); + + // group_feature_start_ + const int* tmp_ptr_group_feature_start = reinterpret_cast(mem_ptr); + dataset->group_feature_start_.clear(); + for (int i = 0; i < dataset->num_groups_; ++i) { + dataset->group_feature_start_.push_back(tmp_ptr_group_feature_start[i]); + } + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * (dataset->num_groups_)); + + // group_feature_cnt_ + const int* tmp_ptr_group_feature_cnt = reinterpret_cast(mem_ptr); + dataset->group_feature_cnt_.clear(); + for (int i = 0; i < dataset->num_groups_; ++i) { + dataset->group_feature_cnt_.push_back(tmp_ptr_group_feature_cnt[i]); + } + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * (dataset->num_groups_)); + + if (!config_.max_bin_by_feature.empty()) { + CHECK_EQ(static_cast(dataset->num_total_features_), config_.max_bin_by_feature.size()); + CHECK_GT(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())), 1); + dataset->max_bin_by_feature_.resize(dataset->num_total_features_); + dataset->max_bin_by_feature_.assign(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end()); + } else { + const int32_t* tmp_ptr_max_bin_by_feature = reinterpret_cast(mem_ptr); + dataset->max_bin_by_feature_.clear(); + for (int i = 0; i < dataset->num_total_features_; ++i) { + dataset->max_bin_by_feature_.push_back(tmp_ptr_max_bin_by_feature[i]); + } + } + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int32_t) * (dataset->num_total_features_)); + if (ArrayArgs::CheckAll(dataset->max_bin_by_feature_, -1)) { + dataset->max_bin_by_feature_.clear(); + } + + // get feature names + dataset->feature_names_.clear(); + for (int i = 0; i < 
dataset->num_total_features_; ++i) { + int str_len = *(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int)); + std::stringstream str_buf; + auto tmp_arr = reinterpret_cast(mem_ptr); + for (int j = 0; j < str_len; ++j) { + char tmp_char = tmp_arr[j]; + str_buf << tmp_char; + } + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(char) * str_len); + dataset->feature_names_.emplace_back(str_buf.str()); + } + // get forced_bin_bounds_ + dataset->forced_bin_bounds_ = std::vector>(dataset->num_total_features_, std::vector()); + for (int i = 0; i < dataset->num_total_features_; ++i) { + int num_bounds = *(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int)); + dataset->forced_bin_bounds_[i] = std::vector(); + const double* tmp_ptr_forced_bounds = + reinterpret_cast(mem_ptr); + for (int j = 0; j < num_bounds; ++j) { + double bound = tmp_ptr_forced_bounds[j]; + dataset->forced_bin_bounds_[i].push_back(bound); + } + mem_ptr += num_bounds * sizeof(double); + } +} + void DatasetLoader::CheckDataset(const Dataset* dataset, bool is_load_from_binary) { if (dataset->num_data_ <= 0) { Log::Fatal("Data file %s is empty", dataset->data_filename_.c_str()); diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 0ebcdc1a6181..3d0f8db8e549 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -451,7 +451,7 @@ class DenseBin : public Bin { } } - void SaveBinaryToFile(const VirtualFileWriter* writer) const override { + void SaveBinaryToFile(BinaryWriter* writer) const override { writer->AlignedWrite(data_.data(), sizeof(VAL_T) * data_.size()); } diff --git a/src/io/file_io.cpp b/src/io/file_io.cpp index a205964287e9..a2721e96c2dd 100644 --- a/src/io/file_io.cpp +++ b/src/io/file_io.cpp @@ -46,7 +46,7 @@ struct LocalFile : VirtualFileReader, VirtualFileWriter { return fread(buffer, 1, bytes, file_); } - size_t Write(const void* buffer, size_t bytes) const { + size_t Write(const void* buffer, size_t bytes) { 
return fwrite(buffer, bytes, 1, file_) == 1 ? bytes : 0; } diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 07e9701b1ca6..2a589fa24ef8 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -675,7 +675,7 @@ void Metadata::LoadFromMemory(const void* memory) { CalculateQueryWeights(); } -void Metadata::SaveBinaryToFile(const VirtualFileWriter* writer) const { +void Metadata::SaveBinaryToFile(BinaryWriter* writer) const { writer->AlignedWrite(&num_data_, sizeof(num_data_)); writer->AlignedWrite(&num_weights_, sizeof(num_weights_)); writer->AlignedWrite(&num_queries_, sizeof(num_queries_)); diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index 79ebb25d08dd..e01c0afcf5bc 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -508,7 +508,7 @@ class SparseBin : public Bin { fast_index_.shrink_to_fit(); } - void SaveBinaryToFile(const VirtualFileWriter* writer) const override { + void SaveBinaryToFile(BinaryWriter* writer) const override { writer->AlignedWrite(&num_vals_, sizeof(num_vals_)); writer->AlignedWrite(deltas_.data(), sizeof(uint8_t) * (num_vals_ + 1)); writer->AlignedWrite(vals_.data(), sizeof(VAL_T) * num_vals_); diff --git a/tests/cpp_tests/test_byte_buffer.cpp b/tests/cpp_tests/test_byte_buffer.cpp new file mode 100644 index 000000000000..98df661ddd31 --- /dev/null +++ b/tests/cpp_tests/test_byte_buffer.cpp @@ -0,0 +1,71 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#include +#include + +#include + +using LightGBM::ByteBuffer; + + +TEST(ByteBuffer, JustWorks) { + std::unique_ptr buffer; + buffer.reset(new ByteBuffer()); + + int cumulativeSize = 0; + EXPECT_EQ(cumulativeSize, buffer->GetSize()); + + int8_t int8Val = 34; + cumulativeSize += sizeof(int8_t); + buffer->Write(&int8Val, sizeof(int8_t)); + EXPECT_EQ(cumulativeSize, buffer->GetSize()); + EXPECT_EQ(int8Val, buffer->GetAt(cumulativeSize - 1)); + + int16_t int16Val = 33; + cumulativeSize += sizeof(int16_t); + buffer->Write(&int16Val, sizeof(int16_t)); + EXPECT_EQ(cumulativeSize, buffer->GetSize()); + int16_t serializedInt16 = 0; + char* int16Ptr = reinterpret_cast(&serializedInt16); + for (int i = 0; i < sizeof(int16_t); i++) { + int16Ptr[i] = buffer->GetAt(cumulativeSize - (sizeof(int16_t) - i)); + } + EXPECT_EQ(int16Val, serializedInt16); + + int64_t int64Val = 35; + cumulativeSize += sizeof(int64_t); + buffer->Write(&int64Val, sizeof(int64_t)); + EXPECT_EQ(cumulativeSize, buffer->GetSize()); + int64_t serializedInt64 = 0; + char* int64Ptr = reinterpret_cast(&serializedInt64); + for (int i = 0; i < sizeof(int64_t); i++) { + int64Ptr[i] = buffer->GetAt(cumulativeSize - (sizeof(int64_t) - i)); + } + EXPECT_EQ(int64Val, serializedInt64); + + double doubleVal = 36.6; + cumulativeSize += sizeof(double); + buffer->Write(&doubleVal, sizeof(doubleVal)); + EXPECT_EQ(cumulativeSize, buffer->GetSize()); + double serializedDouble = 0; + char* doublePtr = reinterpret_cast(&serializedDouble); + for (int i = 0; i < sizeof(double); i++) { + doublePtr[i] = buffer->GetAt(cumulativeSize - (sizeof(double) - i)); + } + EXPECT_EQ(doubleVal, serializedDouble); + + const int charSize = 3; + char charArrayVal[charSize] = { 'a', 'b', 'c' }; + cumulativeSize += charSize; + buffer->Write(charArrayVal, charSize); + EXPECT_EQ(cumulativeSize, buffer->GetSize()); + for (int i = 0; i < charSize; i++) { + EXPECT_EQ(charArrayVal[i], buffer->GetAt(cumulativeSize - (charSize - i))); + } + + // Test 
that Data() points to first value written + EXPECT_EQ(int8Val, *buffer->Data()); +} diff --git a/tests/cpp_tests/test_serialize.cpp b/tests/cpp_tests/test_serialize.cpp new file mode 100644 index 000000000000..7038f0d6ce9f --- /dev/null +++ b/tests/cpp_tests/test_serialize.cpp @@ -0,0 +1,84 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include +#include +#include +#include +#include +#include + +#include + +using LightGBM::ByteBuffer; +using LightGBM::Dataset; +using LightGBM::Log; +using LightGBM::TestUtils; + +TEST(Serialization, JustWorks) { + // Load some test data + DatasetHandle dataset_handle; + const char* params = "max_bin=15"; + int result = TestUtils::LoadDatasetFromExamples("binary_classification/binary.test", params, &dataset_handle); + EXPECT_EQ(0, result) << "LoadDatasetFromExamples result code: " << result; + + Dataset* dataset; + bool succeeded = true; + std::string exceptionText(""); + try { + dataset = static_cast(dataset_handle); + + // Serialize the reference + ByteBufferHandle buffer_handle; + int32_t buffer_len; + result = LGBM_DatasetSerializeReferenceToBinary(dataset_handle, &buffer_handle, &buffer_len); + EXPECT_EQ(0, result) << "LGBM_DatasetSerializeReferenceToBinary result code: " << result; + + ByteBuffer* buffer = nullptr; + Dataset* deserialized_dataset = nullptr; + try { + buffer = static_cast(buffer_handle); + + // Deserialize the reference + DatasetHandle deserialized_dataset_handle; + result = LGBM_DatasetCreateFromSerializedReference(buffer->Data(), + static_cast(buffer->GetSize()), + dataset->num_data(), + 0, // num_classes + params, + &deserialized_dataset_handle); + EXPECT_EQ(0, result) << "LGBM_DatasetCreateFromSerializedReference result code: " << result; + + // Confirm 1 successful API call + deserialized_dataset = static_cast(deserialized_dataset_handle); + EXPECT_EQ(dataset->num_data(), 
deserialized_dataset->num_data()); + } catch (std::exception& ex) { + succeeded = false; + exceptionText = std::string(ex.what()); + } + + // Free memory + if (buffer) { + result = LGBM_ByteBufferFree(buffer); + EXPECT_EQ(0, result) << "LGBM_ByteBufferFree result code: " << result; + } + if (deserialized_dataset) { + result = LGBM_DatasetFree(deserialized_dataset); + EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result; + } + } catch (std::exception& ex) { + succeeded = false; + exceptionText = std::string(ex.what()); + } + + if (dataset) { + result = LGBM_DatasetFree(dataset); + EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result; + } + + if (!succeeded) { + FAIL() << "Test Serialization failed with exception: " << exceptionText; + } +} diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index 876ddda4cf64..342616d27daa 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -258,6 +258,8 @@ + + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index 56b4e29287d5..ed591fc4d87a 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -231,6 +231,12 @@ src\treelearner + + include\LightGBM\utils + + + include\LightGBM\utils +