Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: Add serialization of reference dataset #5427

Merged
merged 10 commits into from
Feb 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/LightGBM/bin.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ class BinMapper {
* \brief Save binary data to file
* \param writer Writer that the binary data is written to
*/
void SaveBinaryToFile(const VirtualFileWriter* writer) const;
void SaveBinaryToFile(BinaryWriter* writer) const;

/*!
* \brief Mapping bin into feature value
Expand Down Expand Up @@ -286,7 +286,7 @@ class Bin {
* \brief Save binary data to file
* \param writer Writer that the binary data is written to
*/
virtual void SaveBinaryToFile(const VirtualFileWriter* writer) const = 0;
virtual void SaveBinaryToFile(BinaryWriter* writer) const = 0;

/*!
* \brief Load from memory
Expand Down
45 changes: 45 additions & 0 deletions include/LightGBM/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
typedef void* DatasetHandle; /*!< \brief Handle of dataset. */
typedef void* BoosterHandle; /*!< \brief Handle of booster. */
typedef void* FastConfigHandle; /*!< \brief Handle of FastConfig. */
typedef void* ByteBufferHandle; /*!< \brief Handle of ByteBuffer. */

#define C_API_DTYPE_FLOAT32 (0) /*!< \brief float32 (single precision float). */
#define C_API_DTYPE_FLOAT64 (1) /*!< \brief float64 (double precision float). */
Expand Down Expand Up @@ -96,6 +97,22 @@ LIGHTGBM_C_EXPORT int LGBM_SampleIndices(int32_t num_total_row,
void* out,
int32_t* out_len);

/*!
* \brief Get a ByteBuffer value at an index.
* \param handle Handle of byte buffer to be read
* \param index Index of value to return
* \param[out] out_val Byte value at index to return
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_ByteBufferGetAt(ByteBufferHandle handle, int32_t index, uint8_t* out_val);

/*!
* \brief Free space for byte buffer.
* \param handle Handle of byte buffer to be freed
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_ByteBufferFree(ByteBufferHandle handle);

/* --- start Dataset interface */

/*!
Expand Down Expand Up @@ -164,6 +181,23 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetInitStreaming(DatasetHandle dataset,
int32_t nthreads,
int32_t omp_max_threads);

/*!
* \brief Allocate the space for dataset and bucket feature bins according to serialized reference dataset.
* \param ref_buffer A binary representation of the dataset schema (feature groups, bins, etc.)
* \param ref_buffer_size The size of the reference array in bytes
* \param num_row Number of total rows the dataset will contain
* \param num_classes Number of classes (will be used only in case of multiclass and specifying initial scores)
* \param parameters Additional parameters
* \param[out] out Created dataset
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSerializedReference(const void* ref_buffer,
int32_t ref_buffer_size,
int64_t num_row,
int32_t num_classes,
const char* parameters,
DatasetHandle* out);

/*!
* \brief Push data to existing dataset, if ``nrow + start_row == num_total_row``, will call ``dataset->FinishLoad``.
* \param dataset Handle of dataset
Expand Down Expand Up @@ -464,6 +498,17 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetFree(DatasetHandle handle);
LIGHTGBM_C_EXPORT int LGBM_DatasetSaveBinary(DatasetHandle handle,
const char* filename);

/*!
* \brief Create a dataset schema representation as a binary byte array (excluding data).
* \param handle Handle of dataset
* \param[out] out The output byte array
* \param[out] out_len The length of the output byte array (returned for convenience)
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetSerializeReferenceToBinary(DatasetHandle handle,
ByteBufferHandle* out,
int32_t* out_len);

/*!
* \brief Save dataset to text file, intended for debugging use only.
* \param handle Handle of dataset
Expand Down
17 changes: 15 additions & 2 deletions include/LightGBM/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <LightGBM/feature_group.h>
#include <LightGBM/meta.h>
#include <LightGBM/train_share_states.h>
#include <LightGBM/utils/byte_buffer.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>
Expand Down Expand Up @@ -124,7 +125,7 @@ class Metadata {
* \brief Save binary data to file
* \param writer Writer that the binary data is written to
*/
void SaveBinaryToFile(const VirtualFileWriter* writer) const;
void SaveBinaryToFile(BinaryWriter* writer) const;

/*!
* \brief Get sizes in byte of this object
Expand Down Expand Up @@ -621,6 +622,11 @@ class Dataset {
*/
LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename);

/*!
* \brief Serialize the overall Dataset definition/schema to a binary buffer (i.e., without data)
*/
LIGHTGBM_EXPORT void SerializeReference(ByteBuffer* out);

LIGHTGBM_EXPORT void DumpTextFile(const char* text_filename);

LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset);
Expand Down Expand Up @@ -919,6 +925,10 @@ class Dataset {
#endif // USE_CUDA

private:
void SerializeHeader(BinaryWriter* serializer);

size_t GetSerializedHeaderSize();

void CreateCUDAColumnData();

std::string data_filename_;
Expand All @@ -938,8 +948,11 @@ class Dataset {
int label_idx_ = 0;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
/*! \brief store feature names */
/*! \brief serialized versions */
static const int kSerializedReferenceVersionLength;
static const char* serialized_reference_version;
static const char* binary_file_token;
static const char* binary_serialized_reference_token;
int num_groups_;
std::vector<int> real_feature_idx_;
std::vector<int> feature2group_;
Expand Down
4 changes: 4 additions & 0 deletions include/LightGBM/dataset_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ class DatasetLoader {

LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);

LIGHTGBM_EXPORT Dataset* LoadFromSerializedReference(const char* buffer, size_t buffer_size, data_size_t num_data, int32_t num_classes);

LIGHTGBM_EXPORT Dataset* ConstructFromSampleData(double** sample_values,
int** sample_indices,
int num_col,
Expand All @@ -45,6 +47,8 @@ class DatasetLoader {
const std::unordered_set<int>& categorical_features);

private:
void LoadHeaderFromMemory(Dataset* dataset, const char* buffer);

Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

void SetHeader(const char* filename);
Expand Down
110 changes: 77 additions & 33 deletions include/LightGBM/feature_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,56 @@ class FeatureGroup {
}

/*!
* \brief Constructor from memory
* \brief Constructor from memory when data is present
* \param memory Pointer of memory
* \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data
* \param group_id Id of group
*/
FeatureGroup(const void* memory, data_size_t num_all_data,
FeatureGroup(const void* memory,
data_size_t num_all_data,
const std::vector<data_size_t>& local_used_indices,
int group_id) {
// Load the definition schema first
const char* memory_ptr = LoadDefinitionFromMemory(memory, group_id);

// Allocate memory for the data
data_size_t num_data = num_all_data;
if (!local_used_indices.empty()) {
num_data = static_cast<data_size_t>(local_used_indices.size());
}
AllocateBins(num_data);

// Now load the actual data
if (is_multi_val_) {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->LoadFromMemory(memory_ptr, local_used_indices);
memory_ptr += multi_bin_data_[i]->SizesInByte();
}
} else {
bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
}
}

/*!
* \brief Constructor from definition in memory (without data)
* \param memory Pointer of memory
* \param num_data Number of data passed to AllocateBins when creating the bins
* \param group_id Id of group
*/
FeatureGroup(const void* memory, data_size_t num_data, int group_id) {
// Only the definition is parsed; the pointer returned by LoadDefinitionFromMemory is not used here
LoadDefinitionFromMemory(memory, group_id);
// Create the bins for num_data; this constructor makes no LoadFromMemory call on them
AllocateBins(num_data);
}

/*! \brief Destructor; the body is empty, so no explicit cleanup is performed here */
~FeatureGroup() {}

/*!
* \brief Load the overall definition of the feature group from binary serialized data
* \param memory Pointer of memory
* \param group_id Id of group
*/
const char* LoadDefinitionFromMemory(const void* memory, int group_id) {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get is_multi_val
is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
Expand All @@ -128,9 +170,9 @@ class FeatureGroup {
memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_));
num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
// get bin mapper
bin_mappers_.clear();

// get bin mapper(s)
bin_mappers_.clear();
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(new BinMapper(memory_ptr));
memory_ptr += bin_mappers_[i]->SizesInByte();
Expand Down Expand Up @@ -158,37 +200,33 @@ class FeatureGroup {
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
}
data_size_t num_data = num_all_data;
if (!local_used_indices.empty()) {
num_data = static_cast<data_size_t>(local_used_indices.size());
}

return memory_ptr;
}

/*!
* \brief Allocate the bins
* \param num_data Number of data the bins are created for
*/
inline void AllocateBins(data_size_t num_data) {
if (is_multi_val_) {
for (int i = 0; i < num_feature_; ++i) {
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
multi_bin_data_.emplace_back(Bin::CreateSparseBin(
num_data, bin_mappers_[i]->num_bin() + addi));
multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
} else {
multi_bin_data_.emplace_back(
Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
}
multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
memory_ptr += multi_bin_data_.back()->SizesInByte();
}
} else {
if (is_sparse_) {
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else {
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
}
// get bin data
bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
}
}

/*! \brief Destructor */
~FeatureGroup() {}

/*!
* \brief Initialize for pushing in a streaming fashion. By default, no action needed.
* \param num_thread The number of external threads that will be calling the push APIs
Expand Down Expand Up @@ -414,42 +452,48 @@ class FeatureGroup {
}

/*!
* \brief Save binary data to file
* \param file File want to write
* \brief Write to binary stream
* \param writer Writer
* \param include_data Whether to write data (true) or just header information (false)
*/
void SaveBinaryToFile(const VirtualFileWriter* writer) const {
void SerializeToBinary(BinaryWriter* writer, bool include_data = true) const {
writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
writer->AlignedWrite(&is_dense_multi_val_, sizeof(is_dense_multi_val_));
writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
writer->AlignedWrite(&num_feature_, sizeof(num_feature_));
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_[i]->SaveBinaryToFile(writer);
}
if (is_multi_val_) {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->SaveBinaryToFile(writer);

if (include_data) {
if (is_multi_val_) {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->SaveBinaryToFile(writer);
}
} else {
bin_data_->SaveBinaryToFile(writer);
}
} else {
bin_data_->SaveBinaryToFile(writer);
}
}

/*!
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const {
size_t SizesInByte(bool include_data = true) const {
size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_)) +
VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
VirtualFileWriter::AlignedSize(sizeof(num_feature_));
for (int i = 0; i < num_feature_; ++i) {
ret += bin_mappers_[i]->SizesInByte();
}
if (!is_multi_val_) {
ret += bin_data_->SizesInByte();
} else {
for (int i = 0; i < num_feature_; ++i) {
ret += multi_bin_data_[i]->SizesInByte();
if (include_data) {
if (!is_multi_val_) {
ret += bin_data_->SizesInByte();
} else {
for (int i = 0; i < num_feature_; ++i) {
ret += multi_bin_data_[i]->SizesInByte();
}
}
}
return ret;
Expand Down
Loading