Basic single-header statistics and ML library for C++ - scikit-learn-like implementation
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
2
down vote
favorite
I am developing a scikit-learn-like implementation for C++. It is in the initial stage, and while developing it I have started to doubt whether this is the correct implementation, since here accuracy is more important than robustness. I am still learning C++ and would like an expert's review; since development is at an early stage, I can still fix problems and prevent future errors. Thank you.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <typeinfo>
#include <vector>
//===================================================================
// FOR COMPUTING MEAN
//===================================================================
/// Computes the arithmetic mean of a sequence of values.
template<typename T>
class mean
{
public:
    /// @param vec values to average; must not be empty.
    /// @return sum(vec) / vec.size(), computed in T (so it truncates for integral T).
    /// @throws std::invalid_argument if vec is empty.
    T get_mean(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mean: empty input");
        T total = 0;
        for (const auto& value : vec) total += value;
        // Cast the count to T: dividing a signed total by the unsigned size()
        // would first convert a negative total to a huge unsigned value.
        return total / static_cast<T>(vec.size());
    }
};
//===================================================================
// FOR COMPUTING MEDIAN
//===================================================================
/// Computes the median of a sequence of values.
template<typename T>
class median
{
public:
    /// @param vec    values to examine; must not be empty.
    /// @param sorted pass true if vec is already in ascending order to skip the sort.
    /// @return the middle element for an odd count, otherwise the average of the
    ///         two middle elements computed in T (truncates for integral T).
    /// @throws std::invalid_argument if vec is empty.
    T get_median(std::vector<T> vec, bool sorted = false) const
    {
        // Without this guard an empty input would index vec[size_t(-1)].
        if (vec.empty()) throw std::invalid_argument("get_median: empty input");
        if (!sorted) std::sort(vec.begin(), vec.end());
        const auto upper_middle = vec.size() / 2;
        if (vec.size() % 2 != 0) return vec[upper_middle];
        return (vec[upper_middle] + vec[upper_middle - 1]) / 2;
    }
};
//====================================================================
// FOR COMPUTING THE MODE
//====================================================================
/// Computes the mode (most frequent value) of a sequence.
template<typename T>
class mode
{
public:
    /// @param vec values to examine; must not be empty.
    /// @return the most frequent value; ties are resolved in favour of the
    ///         smallest value (so an all-unique input yields its minimum).
    /// @throws std::invalid_argument if vec is empty.
    T get_mode(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mode: empty input");
        std::map<T, unsigned long int> number_count_table;
        for (const auto& value : vec) ++number_count_table[value];
        // std::map iterates in ascending key order, so keeping only strictly
        // larger counts leaves the smallest key among equally frequent values.
        auto best = number_count_table.begin();
        for (auto it = number_count_table.begin(); it != number_count_table.end(); ++it)
        {
            if (it->second > best->second) best = it;
        }
        return best->first;
    }
};
//========================================================================================
// FOR COMPUTING WEIGHTED MEAN
//========================================================================================
/// Computes the weighted arithmetic mean sum(v[i] * w[i]) / sum(w[i]).
template <typename T>
class weighted_mean
{
public:
    /// @param vec     values to average; must not be empty.
    /// @param weights weight for each value; needs at least vec.size() entries
    ///                (any extra trailing weights are ignored).
    /// @return the weighted mean, computed in T.
    /// @throws std::invalid_argument if vec is empty, weights is shorter than
    ///         vec, or the used weights sum to zero.
    T get_weighted_mean(const std::vector<T>& vec, const std::vector<T>& weights) const
    {
        if (vec.empty()) throw std::invalid_argument("get_weighted_mean: empty input");
        // A shorter weights vector would be read past its end in the loop below.
        if (weights.size() < vec.size())
            throw std::invalid_argument("get_weighted_mean: fewer weights than values");
        T numerator = 0;
        T total_weight = 0;
        for (std::size_t i = 0; i < vec.size(); ++i)
        {
            numerator += vec[i] * weights[i];
            total_weight += weights[i];
        }
        if (total_weight == T(0))
            throw std::invalid_argument("get_weighted_mean: weights sum to zero");
        return numerator / total_weight;
    }
};
//==========================================================================================
// FOR COMPUTING STANDARD DEVIATION
//==========================================================================================
/// Computes the population standard deviation (divides by N, not N - 1).
template <typename T>
class standard_deviation : public mean<T>
{
public:
    /// @param vec values to examine; must not be empty.
    /// @return sqrt(sum((x - mean)^2) / N), converted to T.
    /// @throws std::invalid_argument if vec is empty.
    T get_standard_deviation(const std::vector<T>& vec)
    {
        if (vec.empty()) throw std::invalid_argument("get_standard_deviation: empty input");
        // Locals instead of data members: the call keeps no state between uses.
        const T mean_value = this->get_mean(vec);
        T squared_deviation_sum = 0;
        for (const auto& value : vec)
        {
            const T deviation = value - mean_value;
            squared_deviation_sum += deviation * deviation;
        }
        return static_cast<T>(std::sqrt(squared_deviation_sum / static_cast<T>(vec.size())));
    }
};
//==========================================================================================
// FOR COMPUTING INTERQUARTILE RANGE
//==========================================================================================
/*
CALCULATING INTERQUARTILE RANGE FOR ODD NO OF ELEMENTS:
-------------------------------------------------------
Given Elements(E) = 3,5,5,7,8,8,9,10,13,13,14,15,16,22,23
Step 1: Sort it! Here we already have an sorted values
E = 3, 5, 5, 7, 8, 8, 9, 10, 13, 13, 14, 15, 16, 22, 23
Step 2: Get the median for the values
Median (M) = 10
Step 3: The lower and the upper half
Elements which are left to the median are called left half and
to the right is right half
3 5 5 7 8 8 9 10 13 13 14 15 16 22 23
|___________| | |__________________|
LOWER HALF MEDIAN UPPER HALF
Step 4:
Q1 = median of left half
Q3 = median of right half
interquartile_range = Q3 - Q1
CALCULATING INTERQUARTILE RANGE FOR EVEN NO OF ELEMENTS:
--------------------------------------------------------
Step 1: Sort the array
Step 2: Get the median of the array
Step 3: Insert the median back to the array
Step 4: Follow odd no procedure since we have an array
of odd size.
*/
/// Computes the interquartile range Q3 - Q1 using the median-of-halves method.
template <typename T>
class interquartile_range : public median<T>
{
public:
    /// @param vec    samples; at least two are required so both halves are non-empty.
    /// @param sorted pass true if vec is already in ascending order to skip the sort.
    /// @return median(upper half) - median(lower half).
    /// @throws std::invalid_argument if vec has fewer than two elements.
    T get_interquartile_range(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
            throw std::invalid_argument("get_interquartile_range: need at least two elements");
        if (!sorted) std::sort(vec.begin(), vec.end());
        return compute(vec);
    }

private:
    /// Splits an ascending vec into its lower and upper halves and returns the
    /// difference of their medians.
    T compute(const std::vector<T>& vec)
    {
        // Each half holds size / 2 elements. For an odd count the middle element
        // belongs to neither half; for an even count the halves meet exactly,
        // which is what inserting the median and then skipping it amounted to.
        const auto half = vec.size() / 2;
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());
        // The halves are already ordered, so tell get_median not to sort again.
        const T q1 = this->get_median(lower_half, true);
        const T q3 = this->get_median(upper_half, true);
        return q3 - q1;
    }
};
//================================================================================
// FREQUENCY MAP TO VECTOR CONVERTER
//================================================================================
/// Expands a value -> count table into a flat list of values.
template <typename T>
class frequency_map_converter
{
public:
    /// Appends each key of frequency_map to target_vector as many times as its
    /// count says, in ascending key order. Existing contents of target_vector
    /// are kept; keys with a count of zero contribute nothing.
    void to_vector(const std::map<T, unsigned long int>& frequency_map, std::vector<T> &target_vector) const
    {
        for (const auto& element : frequency_map)
        {
            // One bulk insert per key instead of count separate push_back calls.
            target_vector.insert(target_vector.end(), element.second, element.first);
        }
    }
};
//================================================================================
// FOR CALCULATING THE RANGE
//================================================================================
/*
HOW TO CALCULATE THE RANGE
--------------------------
sorted input vector = 1, 2, 3, 4
greatest_value = 4
least_value = 1
range = greatest_value - least_value
i.e range = 3
*/
/// Computes the range (largest value minus smallest value) of a sequence.
template <typename T>
class range
{
public:
    /// @param vec    values to examine; must not be empty.
    /// @param sorted pass true if vec is already in ascending order; the first
    ///               and last elements are then used directly.
    /// @return greatest value - least value.
    /// @throws std::invalid_argument if vec is empty.
    T get_range(const std::vector<T>& vec, bool sorted = false) const
    {
        if (vec.empty()) throw std::invalid_argument("get_range: empty input");
        if (sorted) return vec.back() - vec.front();
        // A single linear scan finds both extremes; no copy or sort is needed.
        const auto extremes = std::minmax_element(vec.begin(), vec.end());
        return *extremes.second - *extremes.first;
    }
};
//===============================================================================
// FOR CALCULATING THE QUARTILE
//===============================================================================
/// Computes the three quartiles using the median-of-halves method.
template <typename T>
class quartile : public median<T>
{
public:
    /// @param vec    samples; at least two are required so both halves are non-empty.
    /// @param sorted pass true if vec is already in ascending order to skip the sort.
    /// @return map with the keys "q1", "q2" and "q3".
    /// @throws std::invalid_argument if vec has fewer than two elements.
    std::map<std::string, T> get_quartile(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
            throw std::invalid_argument("get_quartile: need at least two elements");
        if (!sorted) std::sort(vec.begin(), vec.end());
        // Each half holds size / 2 elements; for an odd count the middle
        // element is excluded from both.
        const auto half = vec.size() / 2;
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());
        std::map<std::string, T> result;
        // Everything is ordered by now, so get_median is told not to sort again.
        result["q1"] = this->get_median(lower_half, true);
        result["q2"] = this->get_median(vec, true);
        result["q3"] = this->get_median(upper_half, true);
        return result;
    }
};
// ====================== TEXT TO NUMERICAL DATA CONVERTERS ===================
/// Maps labels to consecutive integer codes assigned in ascending label order.
template <typename T>
class LabelEncoder
{
private:
    long int current_encoded_value = -1;    // last code handed out; -1 means none yet
    std::vector<T> array;                   // labels passed to the most recent fit()
    std::map < T, long int> encoded_values; // label -> code

public:
    /// Learns codes for the labels in array. Labels not seen before receive the
    /// next free codes in ascending label order; codes assigned by an earlier
    /// fit() call are kept.
    void fit(std::vector<T> array)
    {
        this->array = array;
        std::sort(array.begin(), array.end());
        for (const auto& label : array)
        {
            if (encoded_values.find(label) == encoded_values.end())
                encoded_values[label] = ++current_encoded_value;
        }
    }

    /// Encodes array with the learned mapping.
    /// @throws std::invalid_argument if a label was never seen by fit().
    std::vector<long int> transform(const std::vector<T>& array) const
    {
        std::vector<long int> transformed_array;
        transformed_array.reserve(array.size());
        for (const auto& label : array)
        {
            // find() rather than operator[]: operator[] would insert the unknown
            // label and silently report it as code 0.
            const auto found = encoded_values.find(label);
            if (found == encoded_values.end())
                throw std::invalid_argument("LabelEncoder::transform: label was not seen during fit");
            transformed_array.push_back(found->second);
        }
        return transformed_array;
    }

    /// Encodes the labels that were passed to the most recent fit().
    std::vector<long int> transform() const
    {
        return transform(this->array);
    }
};
/*
Future Milestones - To implement the most commonly used vectorizers from scikit-learn:
1. Implement CountVectorizer
2. Implement TfidfVectorizer
*/
// ============================ END OF TEXT TO NUMERICAL ENCODERS ==============================
// ============================ ACTIVATION FUNCTIONS ============================
/// Stateless collection of common neural-network activation functions.
template <typename T>
class activation_function
{
public:
    /// Returns the input unchanged.
    T identity(T value) const
    {
        return value;
    }

    /// Logistic sigmoid 1 / (1 + e^-x), evaluated in long double.
    long double sigmoid(T value) const
    {
        // Widen before negating so the negation never happens in a narrower
        // or unsigned T (where -1 * value would wrap around).
        const long double x = static_cast<long double>(value);
        return 1.0L / (1.0L + std::exp(-x));
    }

    /// Hyperbolic tangent, evaluated in long double.
    long double tan_h(T value) const
    {
        // std::tanh saturates at +/-1, whereas the hand-written
        // (e^x - e^-x) / (e^x + e^-x) becomes inf / inf = NaN once e^x overflows.
        return std::tanh(static_cast<long double>(value));
    }

    /// Step function: 0 for negative input, 1 otherwise.
    int threshold(T value) const
    {
        return (value < 0) ? 0 : 1;
    }
};
This code passed all the unit tests from HackerRank for some of the statistics exercises.
Most of the ML libraries available for C++ lack documentation, so they are hard to understand and use. So I took this initiative. The main aim is to make it very easy for the end user, as scikit-learn does. It is a single header, so it is easy to incorporate into existing projects.
License: Apache 2.0
Documentation: https://github.com/VISWESWARAN1998/statx
c++ statistics machine-learning
add a comment |
up vote
2
down vote
favorite
I am developing a scikit-learn-like implementation for C++. It is in the initial stage, and while developing it I have started to doubt whether this is the correct implementation, since here accuracy is more important than robustness. I am still learning C++ and would like an expert's review; since development is at an early stage, I can still fix problems and prevent future errors. Thank you.
#include<iostream>
#include<vector>
#include<map>
#include<algorithm>
#include<typeinfo>
#include<cmath>
//===================================================================
// FOR COMPUTING MEAN
//===================================================================
/// Computes the arithmetic mean of a sequence of values.
template<typename T>
class mean
{
public:
    /// @param vec values to average; must not be empty.
    /// @return sum(vec) / vec.size(), computed in T (so it truncates for integral T).
    /// @throws std::invalid_argument if vec is empty.
    T get_mean(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mean: empty input");
        T total = 0;
        for (const auto& value : vec) total += value;
        // Cast the count to T: dividing a signed total by the unsigned size()
        // would first convert a negative total to a huge unsigned value.
        return total / static_cast<T>(vec.size());
    }
};
//===================================================================
// FOR COMPUTING MEDIAN
//===================================================================
/// Computes the median of a sequence of values.
template<typename T>
class median
{
public:
    /// @param vec    values to examine; must not be empty.
    /// @param sorted pass true if vec is already in ascending order to skip the sort.
    /// @return the middle element for an odd count, otherwise the average of the
    ///         two middle elements computed in T (truncates for integral T).
    /// @throws std::invalid_argument if vec is empty.
    T get_median(std::vector<T> vec, bool sorted = false) const
    {
        // Without this guard an empty input would index vec[size_t(-1)].
        if (vec.empty()) throw std::invalid_argument("get_median: empty input");
        if (!sorted) std::sort(vec.begin(), vec.end());
        const auto upper_middle = vec.size() / 2;
        if (vec.size() % 2 != 0) return vec[upper_middle];
        return (vec[upper_middle] + vec[upper_middle - 1]) / 2;
    }
};
//====================================================================
// FOR COMPUTING THE MODE
//====================================================================
/// Computes the mode (most frequent value) of a sequence.
template<typename T>
class mode
{
public:
    /// @param vec values to examine; must not be empty.
    /// @return the most frequent value; ties are resolved in favour of the
    ///         smallest value (so an all-unique input yields its minimum).
    /// @throws std::invalid_argument if vec is empty.
    T get_mode(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mode: empty input");
        std::map<T, unsigned long int> number_count_table;
        for (const auto& value : vec) ++number_count_table[value];
        // std::map iterates in ascending key order, so keeping only strictly
        // larger counts leaves the smallest key among equally frequent values.
        auto best = number_count_table.begin();
        for (auto it = number_count_table.begin(); it != number_count_table.end(); ++it)
        {
            if (it->second > best->second) best = it;
        }
        return best->first;
    }
};
//========================================================================================
// FOR COMPUTING WEIGHTED MEAN
//========================================================================================
/// Computes the weighted arithmetic mean sum(v[i] * w[i]) / sum(w[i]).
template <typename T>
class weighted_mean
{
public:
    /// @param vec     values to average; must not be empty.
    /// @param weights weight for each value; needs at least vec.size() entries
    ///                (any extra trailing weights are ignored).
    /// @return the weighted mean, computed in T.
    /// @throws std::invalid_argument if vec is empty, weights is shorter than
    ///         vec, or the used weights sum to zero.
    T get_weighted_mean(const std::vector<T>& vec, const std::vector<T>& weights) const
    {
        if (vec.empty()) throw std::invalid_argument("get_weighted_mean: empty input");
        // A shorter weights vector would be read past its end in the loop below.
        if (weights.size() < vec.size())
            throw std::invalid_argument("get_weighted_mean: fewer weights than values");
        T numerator = 0;
        T total_weight = 0;
        for (std::size_t i = 0; i < vec.size(); ++i)
        {
            numerator += vec[i] * weights[i];
            total_weight += weights[i];
        }
        if (total_weight == T(0))
            throw std::invalid_argument("get_weighted_mean: weights sum to zero");
        return numerator / total_weight;
    }
};
//==========================================================================================
// FOR COMPUTING STANDARD DEVIATION
//==========================================================================================
/// Computes the population standard deviation (divides by N, not N - 1).
template <typename T>
class standard_deviation : public mean<T>
{
public:
    /// @param vec values to examine; must not be empty.
    /// @return sqrt(sum((x - mean)^2) / N), converted to T.
    /// @throws std::invalid_argument if vec is empty.
    T get_standard_deviation(const std::vector<T>& vec)
    {
        if (vec.empty()) throw std::invalid_argument("get_standard_deviation: empty input");
        // Locals instead of data members: the call keeps no state between uses.
        const T mean_value = this->get_mean(vec);
        T squared_deviation_sum = 0;
        for (const auto& value : vec)
        {
            const T deviation = value - mean_value;
            squared_deviation_sum += deviation * deviation;
        }
        return static_cast<T>(std::sqrt(squared_deviation_sum / static_cast<T>(vec.size())));
    }
};
//==========================================================================================
// FOR COMPUTING INTERQUARTILE RANGE
//==========================================================================================
/*
CALCULATING INTERQUARTILE RANGE FOR ODD NO OF ELEMENTS:
-------------------------------------------------------
Given Elements(E) = 3,5,5,7,8,8,9,10,13,13,14,15,16,22,23
Step 1: Sort it! Here we already have an sorted values
E = 3, 5, 5, 7, 8, 8, 9, 10, 13, 13, 14, 15, 16, 22, 23
Step 2: Get the median for the values
Median (M) = 10
Step 3: The lower and the upper half
Elements which are left to the median are called left half and
to the right is right half
3 5 5 7 8 8 9 10 13 13 14 15 16 22 23
|___________| | |__________________|
LOWER HALF MEDIAN UPPER HALF
Step 4:
Q1 = median of left half
Q3 = median of right half
interquartile_range = Q3 - Q1
CALCULATING INTERQUARTILE RANGE FOR EVEN NO OF ELEMENTS:
--------------------------------------------------------
Step 1: Sort the array
Step 2: Get the median of the array
Step 3: Insert the median back to the array
Step 4: Follow odd no procedure since we have an array
of odd size.
*/
/// Computes the interquartile range Q3 - Q1 using the median-of-halves method.
template <typename T>
class interquartile_range : public median<T>
{
public:
    /// @param vec    samples; at least two are required so both halves are non-empty.
    /// @param sorted pass true if vec is already in ascending order to skip the sort.
    /// @return median(upper half) - median(lower half).
    /// @throws std::invalid_argument if vec has fewer than two elements.
    T get_interquartile_range(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
            throw std::invalid_argument("get_interquartile_range: need at least two elements");
        if (!sorted) std::sort(vec.begin(), vec.end());
        return compute(vec);
    }

private:
    /// Splits an ascending vec into its lower and upper halves and returns the
    /// difference of their medians.
    T compute(const std::vector<T>& vec)
    {
        // Each half holds size / 2 elements. For an odd count the middle element
        // belongs to neither half; for an even count the halves meet exactly,
        // which is what inserting the median and then skipping it amounted to.
        const auto half = vec.size() / 2;
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());
        // The halves are already ordered, so tell get_median not to sort again.
        const T q1 = this->get_median(lower_half, true);
        const T q3 = this->get_median(upper_half, true);
        return q3 - q1;
    }
};
//================================================================================
// FREQUENCY MAP TO VECTOR CONVERTER
//================================================================================
/// Expands a value -> count table into a flat list of values.
template <typename T>
class frequency_map_converter
{
public:
    /// Appends each key of frequency_map to target_vector as many times as its
    /// count says, in ascending key order. Existing contents of target_vector
    /// are kept; keys with a count of zero contribute nothing.
    void to_vector(const std::map<T, unsigned long int>& frequency_map, std::vector<T> &target_vector) const
    {
        for (const auto& element : frequency_map)
        {
            // One bulk insert per key instead of count separate push_back calls.
            target_vector.insert(target_vector.end(), element.second, element.first);
        }
    }
};
//================================================================================
// FOR CALCULATING THE RANGE
//================================================================================
/*
HOW TO CALCULATE THE RANGE
--------------------------
sorted input vector = 1, 2, 3, 4
greatest_value = 4
least_value = 1
range = greatest_value - least_value
i.e range = 3
*/
/// Computes the range (largest value minus smallest value) of a sequence.
template <typename T>
class range
{
public:
    /// @param vec    values to examine; must not be empty.
    /// @param sorted pass true if vec is already in ascending order; the first
    ///               and last elements are then used directly.
    /// @return greatest value - least value.
    /// @throws std::invalid_argument if vec is empty.
    T get_range(const std::vector<T>& vec, bool sorted = false) const
    {
        if (vec.empty()) throw std::invalid_argument("get_range: empty input");
        if (sorted) return vec.back() - vec.front();
        // A single linear scan finds both extremes; no copy or sort is needed.
        const auto extremes = std::minmax_element(vec.begin(), vec.end());
        return *extremes.second - *extremes.first;
    }
};
//===============================================================================
// FOR CALCULATING THE QUARTILE
//===============================================================================
/// Computes the three quartiles using the median-of-halves method.
template <typename T>
class quartile : public median<T>
{
public:
    /// @param vec    samples; at least two are required so both halves are non-empty.
    /// @param sorted pass true if vec is already in ascending order to skip the sort.
    /// @return map with the keys "q1", "q2" and "q3".
    /// @throws std::invalid_argument if vec has fewer than two elements.
    std::map<std::string, T> get_quartile(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
            throw std::invalid_argument("get_quartile: need at least two elements");
        if (!sorted) std::sort(vec.begin(), vec.end());
        // Each half holds size / 2 elements; for an odd count the middle
        // element is excluded from both.
        const auto half = vec.size() / 2;
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());
        std::map<std::string, T> result;
        // Everything is ordered by now, so get_median is told not to sort again.
        result["q1"] = this->get_median(lower_half, true);
        result["q2"] = this->get_median(vec, true);
        result["q3"] = this->get_median(upper_half, true);
        return result;
    }
};
// ====================== TEXT TO NUMERICAL DATA CONVERTERS ===================
/// Maps labels to consecutive integer codes assigned in ascending label order.
template <typename T>
class LabelEncoder
{
private:
    long int current_encoded_value = -1;    // last code handed out; -1 means none yet
    std::vector<T> array;                   // labels passed to the most recent fit()
    std::map < T, long int> encoded_values; // label -> code

public:
    /// Learns codes for the labels in array. Labels not seen before receive the
    /// next free codes in ascending label order; codes assigned by an earlier
    /// fit() call are kept.
    void fit(std::vector<T> array)
    {
        this->array = array;
        std::sort(array.begin(), array.end());
        for (const auto& label : array)
        {
            if (encoded_values.find(label) == encoded_values.end())
                encoded_values[label] = ++current_encoded_value;
        }
    }

    /// Encodes array with the learned mapping.
    /// @throws std::invalid_argument if a label was never seen by fit().
    std::vector<long int> transform(const std::vector<T>& array) const
    {
        std::vector<long int> transformed_array;
        transformed_array.reserve(array.size());
        for (const auto& label : array)
        {
            // find() rather than operator[]: operator[] would insert the unknown
            // label and silently report it as code 0.
            const auto found = encoded_values.find(label);
            if (found == encoded_values.end())
                throw std::invalid_argument("LabelEncoder::transform: label was not seen during fit");
            transformed_array.push_back(found->second);
        }
        return transformed_array;
    }

    /// Encodes the labels that were passed to the most recent fit().
    std::vector<long int> transform() const
    {
        return transform(this->array);
    }
};
/*
Future Milestones - To implement the most commonly used vectorizers from scikit-learn:
1. Implement CountVectorizer
2. Implement TfidfVectorizer
*/
// ============================ END OF TEXT TO NUMERICAL ENCODERS ==============================
// ============================ ACTIVATION FUNCTIONS ============================
/// Stateless collection of common neural-network activation functions.
template <typename T>
class activation_function
{
public:
    /// Returns the input unchanged.
    T identity(T value) const
    {
        return value;
    }

    /// Logistic sigmoid 1 / (1 + e^-x), evaluated in long double.
    long double sigmoid(T value) const
    {
        // Widen before negating so the negation never happens in a narrower
        // or unsigned T (where -1 * value would wrap around).
        const long double x = static_cast<long double>(value);
        return 1.0L / (1.0L + std::exp(-x));
    }

    /// Hyperbolic tangent, evaluated in long double.
    long double tan_h(T value) const
    {
        // std::tanh saturates at +/-1, whereas the hand-written
        // (e^x - e^-x) / (e^x + e^-x) becomes inf / inf = NaN once e^x overflows.
        return std::tanh(static_cast<long double>(value));
    }

    /// Step function: 0 for negative input, 1 otherwise.
    int threshold(T value) const
    {
        return (value < 0) ? 0 : 1;
    }
};
This code passed all the unit tests from HackerRank for some of the statistics exercises.
Most of the ML libraries available for C++ lack documentation, so they are hard to understand and use. So I took this initiative. The main aim is to make it very easy for the end user, as scikit-learn does. It is a single header, so it is easy to incorporate into existing projects.
License: Apache 2.0
Documentation: https://github.com/VISWESWARAN1998/statx
c++ statistics machine-learning
3
You are aware that, even with the last sentence in the question, this is licensed under CC-BY-SA-3.0, right?
– Vogel612♦
Jul 14 at 17:15
add a comment |
up vote
2
down vote
favorite
up vote
2
down vote
favorite
I am developing a scikit-learn-like implementation for C++. It is in the initial stage, and while developing it I have started to doubt whether this is the correct implementation, since here accuracy is more important than robustness. I am still learning C++ and would like an expert's review; since development is at an early stage, I can still fix problems and prevent future errors. Thank you.
#include<iostream>
#include<vector>
#include<map>
#include<algorithm>
#include<typeinfo>
#include<cmath>
//===================================================================
// FOR COMPUTING MEAN
//===================================================================
/// Computes the arithmetic mean of a sequence of values.
template<typename T>
class mean
{
public:
    /// @param vec values to average; must not be empty.
    /// @return sum(vec) / vec.size(), computed in T (so it truncates for integral T).
    /// @throws std::invalid_argument if vec is empty.
    T get_mean(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mean: empty input");
        T total = 0;
        for (const auto& value : vec) total += value;
        // Cast the count to T: dividing a signed total by the unsigned size()
        // would first convert a negative total to a huge unsigned value.
        return total / static_cast<T>(vec.size());
    }
};
//===================================================================
// FOR COMPUTING MEDIAN
//===================================================================
/// Computes the median of a sequence of values.
template<typename T>
class median
{
public:
    /// @param vec    values to examine; must not be empty.
    /// @param sorted pass true if vec is already in ascending order to skip the sort.
    /// @return the middle element for an odd count, otherwise the average of the
    ///         two middle elements computed in T (truncates for integral T).
    /// @throws std::invalid_argument if vec is empty.
    T get_median(std::vector<T> vec, bool sorted = false) const
    {
        // Without this guard an empty input would index vec[size_t(-1)].
        if (vec.empty()) throw std::invalid_argument("get_median: empty input");
        if (!sorted) std::sort(vec.begin(), vec.end());
        const auto upper_middle = vec.size() / 2;
        if (vec.size() % 2 != 0) return vec[upper_middle];
        return (vec[upper_middle] + vec[upper_middle - 1]) / 2;
    }
};
//====================================================================
// FOR COMPUTING THE MODE
//====================================================================
/// Computes the mode (most frequent value) of a sequence.
template<typename T>
class mode
{
public:
    /// @param vec values to examine; must not be empty.
    /// @return the most frequent value; ties are resolved in favour of the
    ///         smallest value (so an all-unique input yields its minimum).
    /// @throws std::invalid_argument if vec is empty.
    T get_mode(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mode: empty input");
        std::map<T, unsigned long int> number_count_table;
        for (const auto& value : vec) ++number_count_table[value];
        // std::map iterates in ascending key order, so keeping only strictly
        // larger counts leaves the smallest key among equally frequent values.
        auto best = number_count_table.begin();
        for (auto it = number_count_table.begin(); it != number_count_table.end(); ++it)
        {
            if (it->second > best->second) best = it;
        }
        return best->first;
    }
};
//========================================================================================
// FOR COMPUTING WEIGHTED MEAN
//========================================================================================
/// Computes the weighted arithmetic mean sum(v[i] * w[i]) / sum(w[i]).
template <typename T>
class weighted_mean
{
public:
    /// @param vec     values to average; must not be empty.
    /// @param weights weight for each value; needs at least vec.size() entries
    ///                (any extra trailing weights are ignored).
    /// @return the weighted mean, computed in T.
    /// @throws std::invalid_argument if vec is empty, weights is shorter than
    ///         vec, or the used weights sum to zero.
    T get_weighted_mean(const std::vector<T>& vec, const std::vector<T>& weights) const
    {
        if (vec.empty()) throw std::invalid_argument("get_weighted_mean: empty input");
        // A shorter weights vector would be read past its end in the loop below.
        if (weights.size() < vec.size())
            throw std::invalid_argument("get_weighted_mean: fewer weights than values");
        T numerator = 0;
        T total_weight = 0;
        for (std::size_t i = 0; i < vec.size(); ++i)
        {
            numerator += vec[i] * weights[i];
            total_weight += weights[i];
        }
        if (total_weight == T(0))
            throw std::invalid_argument("get_weighted_mean: weights sum to zero");
        return numerator / total_weight;
    }
};
//==========================================================================================
// FOR COMPUTING STANDARD DEVIATION
//==========================================================================================
/// Computes the population standard deviation (divides by N, not N - 1).
template <typename T>
class standard_deviation : public mean<T>
{
public:
    /// @param vec values to examine; must not be empty.
    /// @return sqrt(sum((x - mean)^2) / N), converted to T.
    /// @throws std::invalid_argument if vec is empty.
    T get_standard_deviation(const std::vector<T>& vec)
    {
        if (vec.empty()) throw std::invalid_argument("get_standard_deviation: empty input");
        // Locals instead of data members: the call keeps no state between uses.
        const T mean_value = this->get_mean(vec);
        T squared_deviation_sum = 0;
        for (const auto& value : vec)
        {
            const T deviation = value - mean_value;
            squared_deviation_sum += deviation * deviation;
        }
        return static_cast<T>(std::sqrt(squared_deviation_sum / static_cast<T>(vec.size())));
    }
};
//==========================================================================================
// FOR COMPUTING INTERQUARTILE RANGE
//==========================================================================================
/*
CALCULATING INTERQUARTILE RANGE FOR ODD NO OF ELEMENTS:
-------------------------------------------------------
Given Elements(E) = 3,5,5,7,8,8,9,10,13,13,14,15,16,22,23
Step 1: Sort it! Here we already have an sorted values
E = 3, 5, 5, 7, 8, 8, 9, 10, 13, 13, 14, 15, 16, 22, 23
Step 2: Get the median for the values
Median (M) = 10
Step 3: The lower and the upper half
Elements which are left to the median are called left half and
to the right is right half
3 5 5 7 8 8 9 10 13 13 14 15 16 22 23
|___________| | |__________________|
LOWER HALF MEDIAN UPPER HALF
Step 4:
Q1 = median of left half
Q3 = median of right half
interquartile_range = Q3 - Q1
CALCULATING INTERQUARTILE RANGE FOR EVEN NO OF ELEMENTS:
--------------------------------------------------------
Step 1: Sort the array
Step 2: Get the median of the array
Step 3: Insert the median back to the array
Step 4: Follow odd no procedure since we have an array
of odd size.
*/
/// Computes the interquartile range Q3 - Q1 using the median-of-halves method.
template <typename T>
class interquartile_range : public median<T>
{
public:
    /// @param vec    samples; at least two are required so both halves are non-empty.
    /// @param sorted pass true if vec is already in ascending order to skip the sort.
    /// @return median(upper half) - median(lower half).
    /// @throws std::invalid_argument if vec has fewer than two elements.
    T get_interquartile_range(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
            throw std::invalid_argument("get_interquartile_range: need at least two elements");
        if (!sorted) std::sort(vec.begin(), vec.end());
        return compute(vec);
    }

private:
    /// Splits an ascending vec into its lower and upper halves and returns the
    /// difference of their medians.
    T compute(const std::vector<T>& vec)
    {
        // Each half holds size / 2 elements. For an odd count the middle element
        // belongs to neither half; for an even count the halves meet exactly,
        // which is what inserting the median and then skipping it amounted to.
        const auto half = vec.size() / 2;
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());
        // The halves are already ordered, so tell get_median not to sort again.
        const T q1 = this->get_median(lower_half, true);
        const T q3 = this->get_median(upper_half, true);
        return q3 - q1;
    }
};
//================================================================================
// FREQUENCY MAP TO VECTOR CONVERTER
//================================================================================
/// Expands a value -> count table into a flat list of values.
template <typename T>
class frequency_map_converter
{
public:
    /// Appends each key of frequency_map to target_vector as many times as its
    /// count says, in ascending key order. Existing contents of target_vector
    /// are kept; keys with a count of zero contribute nothing.
    void to_vector(const std::map<T, unsigned long int>& frequency_map, std::vector<T> &target_vector) const
    {
        for (const auto& element : frequency_map)
        {
            // One bulk insert per key instead of count separate push_back calls.
            target_vector.insert(target_vector.end(), element.second, element.first);
        }
    }
};
//================================================================================
// FOR CALCULATING THE RANGE
//================================================================================
/*
HOW TO CALCULATE THE RANGE
--------------------------
sorted input vector = 1, 2, 3, 4
greatest_value = 4
least_value = 1
range = greatest_value - least_value
i.e range = 3
*/
/// Computes the range (largest value minus smallest value) of a sequence.
template <typename T>
class range
{
public:
    /// @param vec    values to examine; must not be empty.
    /// @param sorted pass true if vec is already in ascending order; the first
    ///               and last elements are then used directly.
    /// @return greatest value - least value.
    /// @throws std::invalid_argument if vec is empty.
    T get_range(const std::vector<T>& vec, bool sorted = false) const
    {
        if (vec.empty()) throw std::invalid_argument("get_range: empty input");
        if (sorted) return vec.back() - vec.front();
        // A single linear scan finds both extremes; no copy or sort is needed.
        const auto extremes = std::minmax_element(vec.begin(), vec.end());
        return *extremes.second - *extremes.first;
    }
};
//===============================================================================
// FOR CALCULATING THE QUARTILE
//===============================================================================
/// Computes the three quartiles using the median-of-halves method.
template <typename T>
class quartile : public median<T>
{
public:
    /// @param vec    samples; at least two are required so both halves are non-empty.
    /// @param sorted pass true if vec is already in ascending order to skip the sort.
    /// @return map with the keys "q1", "q2" and "q3".
    /// @throws std::invalid_argument if vec has fewer than two elements.
    std::map<std::string, T> get_quartile(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
            throw std::invalid_argument("get_quartile: need at least two elements");
        if (!sorted) std::sort(vec.begin(), vec.end());
        // Each half holds size / 2 elements; for an odd count the middle
        // element is excluded from both.
        const auto half = vec.size() / 2;
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());
        std::map<std::string, T> result;
        // Everything is ordered by now, so get_median is told not to sort again.
        result["q1"] = this->get_median(lower_half, true);
        result["q2"] = this->get_median(vec, true);
        result["q3"] = this->get_median(upper_half, true);
        return result;
    }
};
// ====================== TEXT TO NUMERICAL DATA CONVERTERS ===================
/// Maps labels to consecutive integer codes assigned in ascending label order.
template <typename T>
class LabelEncoder
{
private:
    long int current_encoded_value = -1;    // last code handed out; -1 means none yet
    std::vector<T> array;                   // labels passed to the most recent fit()
    std::map < T, long int> encoded_values; // label -> code

public:
    /// Learns codes for the labels in array. Labels not seen before receive the
    /// next free codes in ascending label order; codes assigned by an earlier
    /// fit() call are kept.
    void fit(std::vector<T> array)
    {
        this->array = array;
        std::sort(array.begin(), array.end());
        for (const auto& label : array)
        {
            if (encoded_values.find(label) == encoded_values.end())
                encoded_values[label] = ++current_encoded_value;
        }
    }

    /// Encodes array with the learned mapping.
    /// @throws std::invalid_argument if a label was never seen by fit().
    std::vector<long int> transform(const std::vector<T>& array) const
    {
        std::vector<long int> transformed_array;
        transformed_array.reserve(array.size());
        for (const auto& label : array)
        {
            // find() rather than operator[]: operator[] would insert the unknown
            // label and silently report it as code 0.
            const auto found = encoded_values.find(label);
            if (found == encoded_values.end())
                throw std::invalid_argument("LabelEncoder::transform: label was not seen during fit");
            transformed_array.push_back(found->second);
        }
        return transformed_array;
    }

    /// Encodes the labels that were passed to the most recent fit().
    std::vector<long int> transform() const
    {
        return transform(this->array);
    }
};
/*
Future Milestones - To implement the most commonly used vectorizers from scikit-learn:
1. Implement CountVectorizer
2. Implement TfidfVectorizer
*/
// ============================ END OF TEXT TO NUMERICAL ENCODERS ==============================
// ============================ ACTIVATION FUNCTIONS ============================
/// Stateless collection of common neural-network activation functions.
template <typename T>
class activation_function
{
public:
    /// Returns the input unchanged.
    T identity(T value) const
    {
        return value;
    }

    /// Logistic sigmoid 1 / (1 + e^-x), evaluated in long double.
    long double sigmoid(T value) const
    {
        // Widen before negating so the negation never happens in a narrower
        // or unsigned T (where -1 * value would wrap around).
        const long double x = static_cast<long double>(value);
        return 1.0L / (1.0L + std::exp(-x));
    }

    /// Hyperbolic tangent, evaluated in long double.
    long double tan_h(T value) const
    {
        // std::tanh saturates at +/-1, whereas the hand-written
        // (e^x - e^-x) / (e^x + e^-x) becomes inf / inf = NaN once e^x overflows.
        return std::tanh(static_cast<long double>(value));
    }

    /// Step function: 0 for negative input, 1 otherwise.
    int threshold(T value) const
    {
        return (value < 0) ? 0 : 1;
    }
};
This code passed all the unit tests from HackerRank for some of the statistics exercises.
Most of the ML libraries available for C++ lack documentation, so they are hard to understand and use. So I took this initiative. The main aim is to make it very easy for the end user, as scikit-learn does. It is a single header, so it is easy to incorporate into existing projects.
License: Apache 2.0
Documentation: https://github.com/VISWESWARAN1998/statx
c++ statistics machine-learning
I am developing a scikit-learn-like implementation for C++. It is in the initial stage, and while developing it I have started to doubt whether this is the correct implementation, since here accuracy is more important than robustness. I am still learning C++ and would like an expert's review; since development is at an early stage, I can still fix problems and prevent future errors. Thank you.
#include<iostream>
#include<vector>
#include<map>
#include<algorithm>
#include<typeinfo>
#include<cmath>
//===================================================================
// FOR COMPUTING MEAN
//===================================================================
// Arithmetic mean of a sequence of values of type T.
template<typename T>
class mean
{
public:
    // Returns sum(vec) / vec.size(), computed in T (integral T truncates).
    // Throws std::invalid_argument when vec is empty.
    T get_mean(const std::vector<T>& vec)
    {
        if (vec.empty())
            throw std::invalid_argument("get_mean: input vector is empty");
        T total = 0;
        for (const auto& value : vec) total += value;
        // Cast the count to T: dividing a signed total by the unsigned size()
        // would convert a negative total to a huge unsigned value first.
        return total / static_cast<T>(vec.size());
    }
};
//===================================================================
// FOR COMPUTING MEDIAN
//===================================================================
// Median of a sequence of values of type T.
template<typename T>
class median
{
public:
    // Returns the middle element (odd count) or the average of the two middle
    // elements (even count, computed in T, so integral T truncates).
    // vec    - the values; taken by value because it may be sorted locally.
    // sorted - pass true when vec is already in ascending order to skip sorting.
    // Throws std::invalid_argument when vec is empty.
    T get_median(std::vector<T> vec, bool sorted = false)
    {
        if (vec.empty())
            throw std::invalid_argument("get_median: input vector is empty");
        if (!sorted) std::sort(vec.begin(), vec.end());
        const auto upper_middle = vec.size() / 2;
        // Odd count (including a single element): the middle element is the median.
        if (vec.size() % 2 != 0) return vec[upper_middle];
        return (vec[upper_middle] + vec[upper_middle - 1]) / 2;
    }
};
//====================================================================
// FOR COMPUTING THE MODE
//====================================================================
// Mode (most frequent value) of a sequence of values of type T.
template<typename T>
class mode
{
public:
    // Returns the value with the highest number of occurrences; when several
    // values tie (including the all-unique case) the smallest one is returned.
    // Throws std::invalid_argument when vec is empty.
    T get_mode(const std::vector<T>& vec)
    {
        if (vec.empty())
            throw std::invalid_argument("get_mode: input vector is empty");
        std::map<T, unsigned long int> occurrences;
        for (const auto& value : vec) ++occurrences[value];
        // std::map iterates in ascending key order, so keeping the first entry
        // with a strictly greater count yields the smallest value among ties.
        auto best = occurrences.begin();
        for (auto it = occurrences.begin(); it != occurrences.end(); ++it)
            if (it->second > best->second) best = it;
        return best->first;
    }
};
//========================================================================================
// FOR COMPUTING WEIGHTED MEAN
//========================================================================================
// Weighted arithmetic mean of a sequence of values of type T.
template <typename T>
class weighted_mean
{
public:
    // Returns sum(vec[i] * weights[i]) / sum(weights[i]) over the elements of vec.
    // weights must provide at least one weight per value; surplus weights are
    // ignored. The result is computed in T (integral T truncates).
    // Throws std::invalid_argument when vec is empty, when there are fewer
    // weights than values, or when the weights sum to zero.
    T get_weighted_mean(const std::vector<T>& vec, const std::vector<T>& weights)
    {
        if (vec.empty())
            throw std::invalid_argument("get_weighted_mean: input vector is empty");
        if (weights.size() < vec.size())
            throw std::invalid_argument("get_weighted_mean: fewer weights than values");
        T numerator = 0;
        T total_weight = 0;
        for (std::size_t i = 0; i < vec.size(); ++i)
        {
            numerator += vec[i] * weights[i];
            total_weight += weights[i];
        }
        if (total_weight == T(0))
            throw std::invalid_argument("get_weighted_mean: weights sum to zero");
        return numerator / total_weight;
    }
};
//==========================================================================================
// FOR COMPUTING STANDARD DEVIATION
//==========================================================================================
// Population standard deviation of a sequence of values of type T.
// Inherits get_mean from mean<T>.
template <typename T>
class standard_deviation : public mean<T>
{
public:
    // Returns sqrt(sum((x - mean)^2) / N), i.e. the population (divide-by-N)
    // standard deviation. All arithmetic is done in T, so integral T truncates
    // both the mean and the result.
    // Throws std::invalid_argument when vec is empty.
    T get_standard_deviation(const std::vector<T>& vec)
    {
        if (vec.empty())
            throw std::invalid_argument("get_standard_deviation: input vector is empty");
        const T mean_value = this->get_mean(vec);
        T squared_deviation_sum = 0;
        for (const auto& value : vec)
        {
            const T deviation = value - mean_value;
            squared_deviation_sum += deviation * deviation;
        }
        // std::sqrt (not the unqualified C function) selects the overload
        // matching T, so float/long double keep their precision.
        return static_cast<T>(std::sqrt(squared_deviation_sum / static_cast<T>(vec.size())));
    }
};
//==========================================================================================
// FOR COMPUTING INTERQUARTILE RANGE
//==========================================================================================
/*
CALCULATING INTERQUARTILE RANGE FOR ODD NO OF ELEMENTS:
-------------------------------------------------------
Given Elements(E) = 3,5,5,7,8,8,9,10,13,13,14,15,16,22,23
Step 1: Sort it! Here the values are already sorted
E = 3, 5, 5, 7, 8, 8, 9, 10, 13, 13, 14, 15, 16, 22, 23
Step 2: Get the median for the values
Median (M) = 10
Step 3: The lower and the upper half
Elements which are left to the median are called left half and
to the right is right half
3 5 5 7 8 8 9 10 13 13 14 15 16 22 23
|___________| | |__________________|
LOWER HALF MEDIAN UPPER HALF
Step 4:
Q1 = median of left half
Q3 = median of right half
interquartile_range = Q3 - Q1
CALCULATING INTERQUARTILE RANGE FOR EVEN NO OF ELEMENTS:
--------------------------------------------------------
Step 1: Sort the array
Step 2: Get the median of the array
Step 3: Insert the median back to the array
Step 4: Follow odd no procedure since we have an array
of odd size.
*/
// Interquartile range (Q3 - Q1) of a sequence of values of type T.
// Inherits get_median from median<T>.
template <typename T>
class interquartile_range : public median<T>
{
public:
    // Q1 is the median of the lower half and Q3 the median of the upper half,
    // where each half holds floor(N / 2) elements and the middle element of an
    // odd-sized input belongs to neither half.
    // vec    - the values; taken by value because it may be sorted locally.
    // sorted - pass true when vec is already in ascending order to skip sorting.
    // Throws std::invalid_argument when vec has fewer than two elements, since
    // the halves would then be empty.
    T get_interquartile_range(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
            throw std::invalid_argument("get_interquartile_range: need at least two elements");
        if (!sorted) std::sort(vec.begin(), vec.end());
        // Inserting the median into an even-sized input and then excluding it
        // again selects exactly the first and last N / 2 elements, so both
        // parities reduce to the same two slices.
        const auto half = vec.size() / 2;
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());
        // The halves are slices of sorted data, so skip re-sorting them.
        const T q1 = this->get_median(lower_half, true);
        const T q3 = this->get_median(upper_half, true);
        return q3 - q1;
    }
};
//================================================================================
// FREQUENCY MAP TO VECTOR CONVERTER
//================================================================================
// Expands a value -> count frequency table into a flat list of values.
template <typename T>
class frequency_map_converter
{
public:
    // Appends each key of frequency_map to target_vector as many times as its
    // mapped count, in ascending key order. Existing contents of target_vector
    // are kept.
    void to_vector(const std::map<T, unsigned long int>& frequency_map, std::vector<T> &target_vector)
    {
        // Take the map and its entries by reference: the by-value versions
        // copied the whole table and then every pair again.
        for (const auto& entry : frequency_map)
            target_vector.insert(target_vector.end(), entry.second, entry.first);
    }
};
//================================================================================
// FOR CALCULATING THE RANGE
//================================================================================
/*
HOW TO CALCULATE THE RANGE
--------------------------
sorted input vector = 1, 2, 3, 4
greatest_value = 4
least_value = 1
range = greatest_value - least_value
i.e range = 3
*/
// Range (largest value minus smallest value) of a sequence of values of type T.
template <typename T>
class range
{
public:
    // vec    - the values.
    // sorted - pass true when vec is already in ascending order; the range is
    //          then simply last - first.
    // Throws std::invalid_argument when vec is empty.
    T get_range(const std::vector<T>& vec, bool sorted = false)
    {
        if (vec.empty())
            throw std::invalid_argument("get_range: input vector is empty");
        if (sorted) return vec.back() - vec.front();
        // Only the two extremes are needed, so a single linear scan replaces
        // the full sort.
        const auto extremes = std::minmax_element(vec.begin(), vec.end());
        return *extremes.second - *extremes.first;
    }
};
//===============================================================================
// FOR CALCULATING THE QUARTILE
//===============================================================================
// Quartiles (Q1, Q2, Q3) of a sequence of values of type T.
// Inherits get_median from median<T>.
template <typename T>
class quartile : public median<T>
{
public:
    // Returns a map with the keys "q1", "q2" and "q3":
    //   q2 - median of all values,
    //   q1 - median of the lower floor(N / 2) values,
    //   q3 - median of the upper floor(N / 2) values
    // (the middle element of an odd-sized input belongs to neither half).
    // vec    - the values; taken by value because it may be sorted locally.
    // sorted - pass true when vec is already in ascending order to skip sorting.
    // Throws std::invalid_argument when vec has fewer than two elements, since
    // the halves would then be empty.
    std::map<std::string, T> get_quartile(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
            throw std::invalid_argument("get_quartile: need at least two elements");
        if (!sorted) std::sort(vec.begin(), vec.end());
        const auto half = vec.size() / 2;
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());
        // Everything below operates on sorted data, so tell get_median to
        // skip its own sort.
        std::map<std::string, T> result;
        result["q1"] = this->get_median(lower_half, true);
        result["q2"] = this->get_median(vec, true);
        result["q3"] = this->get_median(upper_half, true);
        return result;
    }
};
// ====================== TEXT TO NUMERICAL DATA CONVERTERS ===================
// Maps distinct labels of type T to consecutive integer codes 0..K-1, assigned
// in ascending label order.
template <typename T>
class LabelEncoder
{
private:
    long int current_encoded_value = -1;   // highest code handed out so far (-1: none)
    std::vector<T> array;                  // labels passed to the most recent fit()
    std::map < T, long int> encoded_values; // label -> code
public:
    // Learns the label -> code mapping from array and remembers array so that
    // transform() without arguments can encode it. Calling fit again discards
    // the previous mapping instead of appending to it.
    void fit(const std::vector<T>& array)
    {
        encoded_values.clear();
        current_encoded_value = -1;
        // Collect the distinct labels first; the map keeps them ordered, so
        // numbering them in iteration order gives ascending codes.
        for (const auto& label : array) encoded_values.emplace(label, 0);
        for (auto& entry : encoded_values) entry.second = ++current_encoded_value;
        this->array = array;
    }

    // Encodes every label in array with the mapping learned by fit().
    // Throws std::out_of_range for a label that was not seen during fit();
    // operator[] would instead insert it and silently report code 0.
    std::vector<long int> transform(const std::vector<T>& array)
    {
        std::vector<long int> transformed_array;
        transformed_array.reserve(array.size());
        for (const auto& label : array)
            transformed_array.push_back(encoded_values.at(label));
        return transformed_array;
    }

    // Encodes the labels that were passed to the most recent fit().
    std::vector<long int> transform()
    {
        return transform(this->array);
    }
};
/*
Future Milestones - To implement the most commonly used vectorizers from scikit-learn:
1. Implement CountVectorizer
2. Implement TfidfVectorizer
*/
// ============================ END OF TEXT TO NUMERICAL ENCODERS ==============================
// ============================ ACTIVATION FUNCTIONS ============================
// Collection of scalar activation functions for a numeric type T.
template <typename T>
class activation_function
{
public:
    // Identity activation: returns the input unchanged.
    T identity(T value)
    {
        return value;
    }

    // Logistic sigmoid 1 / (1 + e^-x), evaluated in long double precision.
    long double sigmoid(T value)
    {
        // Convert first so the negation and std::exp both run in long double.
        const long double x = static_cast<long double>(value);
        return 1.0L / (1.0L + std::exp(-x));
    }

    // Hyperbolic tangent.
    long double tan_h(T value)
    {
        // std::tanh saturates at +/-1; the hand-written
        // (e^x - e^-x) / (e^x + e^-x) overflows to inf/inf (NaN) for large |x|.
        return std::tanh(static_cast<long double>(value));
    }

    // Step function: 0 for negative input, 1 otherwise.
    int threshold(T value)
    {
        if (value < 0) return 0;
        return 1;
    }
};
This code passed all the unit tests on HackerRank for some of the statistics exercises.
Most of the ML libraries available in C++ come without documentation, so they are hard to understand and use. So I took this initiative. The main aim is to make it very easy for the end user, like scikit-learn does. It is a single header, so it is easy to incorporate into existing projects.
License: Apache 2.0
Documentation: https://github.com/VISWESWARAN1998/statx
c++ statistics machine-learning
asked Jul 14 at 15:56
VISWESWARAN NAGASIVAM
192214
192214
3
You are aware that, even with the last sentence in the question, this is licensed under CC-BY-SA-3.0, right?
– Vogel612♦
Jul 14 at 17:15
add a comment |
3
You are aware that, even with the last sentence in the question, this is licensed under CC-BY-SA-3.0, right?
– Vogel612♦
Jul 14 at 17:15
3
3
You are aware that, even with the last sentence in the question, this is licensed under CC-BY-SA-3.0, right?
– Vogel612♦
Jul 14 at 17:15
You are aware that, even with the last sentence in the question, this is licensed under CC-BY-SA-3.0, right?
– Vogel612♦
Jul 14 at 17:15
add a comment |
1 Answer
1
active
oldest
votes
up vote
4
down vote
accepted
Wrapper classes
I don't get why many algorithms (like get_mean
, get_median
, ...) are wrapped inside a class that is basically just a fancy namespace
.
Why force the user to write mean.get_mean(...)
if mean(...)
would suffice?
Repetition
In many functions, there is a repeating pattern: The function takes a bool sorted
parameter, and the first statement is if(!sorted) std::sort(...);
.
First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.
Second, in many cases it isn't actually necessary to fully sort the std::vector
($\mathcal{O}(n \log n)$): There might be alternatives (like std::nth_element
, $\mathcal{O}(n)$) that can do the intended job and have better performance.
Last, why pass explicitly std::vector<T>
? Not only does this force a copy of all the underlying data, many of those algorithms are general enough so that they should work on other data structures, too! Taking a pair of iterators also means that they can be applied to parts of containers (e.g. simplifying the quartile calculation).
For example, fixing these issues on median::get_median
// Median of an already sorted, non-empty range [begin, end).
// Does not modify the range. Works with any forward iterator: std::next is
// O(1) for random-access iterators and O(n) otherwise.
// For an even number of elements the two middle elements are averaged with
// "/ 2" in the element type, so integral elements truncate.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
auto median_of_sorted(Iter begin, Iter end)
{
    const auto size = std::distance(begin, end);
    if (size <= 0)
        throw std::invalid_argument("median_of_sorted: empty range");
    if (size % 2 != 0)
        return *std::next(begin, size / 2);
    const auto middle_low = *std::next(begin, (size - 1) / 2);
    const auto middle_high = *std::next(begin, (size + 1) / 2);
    return (middle_low + middle_high) / 2;
}
// Median of an unsorted, non-empty range [begin, end) of random-access
// iterators. Uses std::nth_element instead of a full sort, so the range is
// partially reordered as a side effect.
// For an even number of elements the two middle elements are averaged with
// "/ 2" in the element type, so integral elements truncate.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
auto median(Iter begin, Iter end)
{
    const auto size = std::distance(begin, end);
    if (size <= 0)
        throw std::invalid_argument("median: empty range");
    if (size % 2 != 0)
    {
        const auto middle = begin + size / 2;
        std::nth_element(begin, middle, end);
        return *middle;
    }
    const auto middle_low = begin + (size - 1) / 2;
    const auto middle_high = begin + (size + 1) / 2;
    // Place the upper middle first; everything before it is then <= it, so the
    // lower middle only has to be selected within [begin, middle_high).
    std::nth_element(begin, middle_high, end);
    std::nth_element(begin, middle_low, middle_high);
    return (*middle_low + *middle_high) / 2;
}
median_of_sorted
has $\mathcal{O}(1)$ runtime complexity (for random access iterators, $\mathcal{O}(n)$ otherwise) and doesn't modify the container.
median
has $\mathcal{O}(n)$ runtime complexity and might modify the container.
Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.
Note: The code from
median
/median_of_sorted
can easily be extended to calculate any percentile.
Other issues
unsigned long int
might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t
or uint64_t
from the header <cstdint>
. If the bit length itself is not relevant, maybe use size_t
(or simply unsigned
) instead to prevent confusion.
Do you need the ordering of
std::map
? If not, you might want to use std::unordered_map
instead (hash map instead of binary search tree).
Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.
add a comment |Â
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
4
down vote
accepted
Wrapper classes
I don't get why many algorithms (like get_mean
, get_median
, ...) are wrapped inside a class that is basically just a fancy namespace
.
Why force the user to write mean.get_mean(...)
if mean(...)
would suffice?
Repetition
In many functions, there is a repeating pattern: The function takes a bool sorted
parameter, and the first statement is if(!sorted) std::sort(...);
.
First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.
Second, in many cases it isn't actually necessary to fully sort the std::vector
($\mathcal{O}(n \log n)$): There might be alternatives (like std::nth_element
, $\mathcal{O}(n)$) that can do the intended job and have better performance.
Last, why pass explicitly std::vector<T>
? Not only does this force a copy of all the underlying data, many of those algorithms are general enough so that they should work on other data structures, too! Taking a pair of iterators also means that they can be applied to parts of containers (e.g. simplifying the quartile calculation).
For example, fixing these issues on median::get_median
// Median of an already sorted, non-empty range [begin, end).
// Does not modify the range. Works with any forward iterator: std::next is
// O(1) for random-access iterators and O(n) otherwise.
// For an even number of elements the two middle elements are averaged with
// "/ 2" in the element type, so integral elements truncate.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
auto median_of_sorted(Iter begin, Iter end)
{
    const auto size = std::distance(begin, end);
    if (size <= 0)
        throw std::invalid_argument("median_of_sorted: empty range");
    if (size % 2 != 0)
        return *std::next(begin, size / 2);
    const auto middle_low = *std::next(begin, (size - 1) / 2);
    const auto middle_high = *std::next(begin, (size + 1) / 2);
    return (middle_low + middle_high) / 2;
}
// Median of an unsorted, non-empty range [begin, end) of random-access
// iterators. Uses std::nth_element instead of a full sort, so the range is
// partially reordered as a side effect.
// For an even number of elements the two middle elements are averaged with
// "/ 2" in the element type, so integral elements truncate.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
auto median(Iter begin, Iter end)
{
    const auto size = std::distance(begin, end);
    if (size <= 0)
        throw std::invalid_argument("median: empty range");
    if (size % 2 != 0)
    {
        const auto middle = begin + size / 2;
        std::nth_element(begin, middle, end);
        return *middle;
    }
    const auto middle_low = begin + (size - 1) / 2;
    const auto middle_high = begin + (size + 1) / 2;
    // Place the upper middle first; everything before it is then <= it, so the
    // lower middle only has to be selected within [begin, middle_high).
    std::nth_element(begin, middle_high, end);
    std::nth_element(begin, middle_low, middle_high);
    return (*middle_low + *middle_high) / 2;
}
median_of_sorted
has $\mathcal{O}(1)$ runtime complexity (for random access iterators, $\mathcal{O}(n)$ otherwise) and doesn't modify the container.
median
has $\mathcal{O}(n)$ runtime complexity and might modify the container.
Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.
Note: The code from
median
/median_of_sorted
can easily be extended to calculate any percentile.
Other issues
unsigned long int
might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t
or uint64_t
from the header <cstdint>
. If the bit length itself is not relevant, maybe use size_t
(or simply unsigned
) instead to prevent confusion.
Do you need the ordering of
std::map
? If not, you might want to use std::unordered_map
instead (hash map instead of binary search tree).
Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.
add a comment |Â
up vote
4
down vote
accepted
Wrapper classes
I don't get why many algorithms (like get_mean
, get_median
, ...) are wrapped inside a class that is basically just a fancy namespace
.
Why force the user to write mean.get_mean(...)
if mean(...)
would suffice?
Repetition
In many functions, there is a repeating pattern: The function takes a bool sorted
parameter, and the first statement is if(!sorted) std::sort(...);
.
First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.
Second, in many cases it isn't actually necessary to fully sort the std::vector
($\mathcal{O}(n \log n)$): There might be alternatives (like std::nth_element
, $\mathcal{O}(n)$) that can do the intended job and have better performance.
Last, why pass explicitly std::vector<T>
? Not only does this force a copy of all the underlying data, many of those algorithms are general enough so that they should work on other data structures, too! Taking a pair of iterators also means that they can be applied to parts of containers (e.g. simplifying the quartile calculation).
For example, fixing these issues on median::get_median
// Median of an already sorted, non-empty range [begin, end).
// Does not modify the range. Works with any forward iterator: std::next is
// O(1) for random-access iterators and O(n) otherwise.
// For an even number of elements the two middle elements are averaged with
// "/ 2" in the element type, so integral elements truncate.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
auto median_of_sorted(Iter begin, Iter end)
{
    const auto size = std::distance(begin, end);
    if (size <= 0)
        throw std::invalid_argument("median_of_sorted: empty range");
    if (size % 2 != 0)
        return *std::next(begin, size / 2);
    const auto middle_low = *std::next(begin, (size - 1) / 2);
    const auto middle_high = *std::next(begin, (size + 1) / 2);
    return (middle_low + middle_high) / 2;
}
// Median of an unsorted, non-empty range [begin, end) of random-access
// iterators. Uses std::nth_element instead of a full sort, so the range is
// partially reordered as a side effect.
// For an even number of elements the two middle elements are averaged with
// "/ 2" in the element type, so integral elements truncate.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
auto median(Iter begin, Iter end)
{
    const auto size = std::distance(begin, end);
    if (size <= 0)
        throw std::invalid_argument("median: empty range");
    if (size % 2 != 0)
    {
        const auto middle = begin + size / 2;
        std::nth_element(begin, middle, end);
        return *middle;
    }
    const auto middle_low = begin + (size - 1) / 2;
    const auto middle_high = begin + (size + 1) / 2;
    // Place the upper middle first; everything before it is then <= it, so the
    // lower middle only has to be selected within [begin, middle_high).
    std::nth_element(begin, middle_high, end);
    std::nth_element(begin, middle_low, middle_high);
    return (*middle_low + *middle_high) / 2;
}
median_of_sorted
has $\mathcal{O}(1)$ runtime complexity (for random access iterators, $\mathcal{O}(n)$ otherwise) and doesn't modify the container.
median
has $\mathcal{O}(n)$ runtime complexity and might modify the container.
Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.
Note: The code from
median
/median_of_sorted
can easily be extended to calculate any percentile.
Other issues
unsigned long int
might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t
or uint64_t
from the header <cstdint>
. If the bit length itself is not relevant, maybe use size_t
(or simply unsigned
) instead to prevent confusion.
Do you need the ordering of
std::map
? If not, you might want to use std::unordered_map
instead (hash map instead of binary search tree).
Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.
add a comment |Â
up vote
4
down vote
accepted
up vote
4
down vote
accepted
Wrapper classes
I don't get why many algorithms (like get_mean
, get_median
, ...) are wrapped inside a class that is basically just a fancy namespace
.
Why force the user to write mean.get_mean(...)
if mean(...)
would suffice?
Repetition
In many functions, there is a repeating pattern: The function takes a bool sorted
parameter, and the first statement is if(!sorted) std::sort(...);
.
First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.
Second, in many cases it isn't actually necessary to fully sort the std::vector
($\mathcal{O}(n \log n)$): There might be alternatives (like std::nth_element
, $\mathcal{O}(n)$) that can do the intended job and have better performance.
Last, why pass explicitly std::vector<T>
? Not only does this force a copy of all the underlying data, many of those algorithms are general enough so that they should work on other data structures, too! Taking a pair of iterators also means that they can be applied to parts of containers (e.g. simplifying the quartile calculation).
For example, fixing these issues on median::get_median
// Median of an already sorted, non-empty range [begin, end).
// Does not modify the range. Works with any forward iterator: std::next is
// O(1) for random-access iterators and O(n) otherwise.
// For an even number of elements the two middle elements are averaged with
// "/ 2" in the element type, so integral elements truncate.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
auto median_of_sorted(Iter begin, Iter end)
{
    const auto size = std::distance(begin, end);
    if (size <= 0)
        throw std::invalid_argument("median_of_sorted: empty range");
    if (size % 2 != 0)
        return *std::next(begin, size / 2);
    const auto middle_low = *std::next(begin, (size - 1) / 2);
    const auto middle_high = *std::next(begin, (size + 1) / 2);
    return (middle_low + middle_high) / 2;
}
// Median of an unsorted, non-empty range [begin, end) of random-access
// iterators. Uses std::nth_element instead of a full sort, so the range is
// partially reordered as a side effect.
// For an even number of elements the two middle elements are averaged with
// "/ 2" in the element type, so integral elements truncate.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
auto median(Iter begin, Iter end)
{
    const auto size = std::distance(begin, end);
    if (size <= 0)
        throw std::invalid_argument("median: empty range");
    if (size % 2 != 0)
    {
        const auto middle = begin + size / 2;
        std::nth_element(begin, middle, end);
        return *middle;
    }
    const auto middle_low = begin + (size - 1) / 2;
    const auto middle_high = begin + (size + 1) / 2;
    // Place the upper middle first; everything before it is then <= it, so the
    // lower middle only has to be selected within [begin, middle_high).
    std::nth_element(begin, middle_high, end);
    std::nth_element(begin, middle_low, middle_high);
    return (*middle_low + *middle_high) / 2;
}
median_of_sorted
has $\mathcal{O}(1)$ runtime complexity (for random access iterators, $\mathcal{O}(n)$ otherwise) and doesn't modify the container.
median
has $\mathcal{O}(n)$ runtime complexity and might modify the container.
Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.
Note: The code from
median
/median_of_sorted
can easily be extended to calculate any percentile.
Other issues
unsigned long int
might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t
or uint64_t
from the header <cstdint>
. If the bit length itself is not relevant, maybe use size_t
(or simply unsigned
) instead to prevent confusion.
Do you need the ordering of
std::map
? If not, you might want to use std::unordered_map
instead (hash map instead of binary search tree).
Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.
Wrapper classes
I don't get why many algorithms (like get_mean
, get_median
, ...) are wrapped inside a class that is basically just a fancy namespace
.
Why force the user to write mean.get_mean(...)
if mean(...)
would suffice?
Repetition
In many functions, there is a repeating pattern: The function takes a bool sorted
parameter, and the first statement is if(!sorted) std::sort(...);
.
First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.
Second, in many cases it isn't actually necessary to fully sort the std::vector
($\mathcal{O}(n \log n)$): There might be alternatives (like std::nth_element
, $\mathcal{O}(n)$) that can do the intended job and have better performance.
Last, why pass explicitly std::vector<T>
? Not only does this force a copy of all the underlying data, many of those algorithms are general enough so that they should work on other data structures, too! Taking a pair of iterators also means that they can be applied to parts of containers (e.g. simplifying the quartile calculation).
For example, fixing these issues on median::get_median
// Median of an already sorted, non-empty range [begin, end).
// Does not modify the range. Works with any forward iterator: std::next is
// O(1) for random-access iterators and O(n) otherwise.
// For an even number of elements the two middle elements are averaged with
// "/ 2" in the element type, so integral elements truncate.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
auto median_of_sorted(Iter begin, Iter end)
{
    const auto size = std::distance(begin, end);
    if (size <= 0)
        throw std::invalid_argument("median_of_sorted: empty range");
    if (size % 2 != 0)
        return *std::next(begin, size / 2);
    const auto middle_low = *std::next(begin, (size - 1) / 2);
    const auto middle_high = *std::next(begin, (size + 1) / 2);
    return (middle_low + middle_high) / 2;
}
// Median of an unsorted, non-empty range [begin, end) of random-access
// iterators. Uses std::nth_element instead of a full sort, so the range is
// partially reordered as a side effect.
// For an even number of elements the two middle elements are averaged with
// "/ 2" in the element type, so integral elements truncate.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
auto median(Iter begin, Iter end)
{
    const auto size = std::distance(begin, end);
    if (size <= 0)
        throw std::invalid_argument("median: empty range");
    if (size % 2 != 0)
    {
        const auto middle = begin + size / 2;
        std::nth_element(begin, middle, end);
        return *middle;
    }
    const auto middle_low = begin + (size - 1) / 2;
    const auto middle_high = begin + (size + 1) / 2;
    // Place the upper middle first; everything before it is then <= it, so the
    // lower middle only has to be selected within [begin, middle_high).
    std::nth_element(begin, middle_high, end);
    std::nth_element(begin, middle_low, middle_high);
    return (*middle_low + *middle_high) / 2;
}
median_of_sorted
has $\mathcal{O}(1)$ runtime complexity (for random access iterators, $\mathcal{O}(n)$ otherwise) and doesn't modify the container.
median
has $\mathcal{O}(n)$ runtime complexity and might modify the container.
Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.
Note: The code from
median
/median_of_sorted
can easily be extended to calculate any percentile.
Other issues
unsigned long int
might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t
or uint64_t
from the header <cstdint>
. If the bit length itself is not relevant, maybe use size_t
(or simply unsigned
) instead to prevent confusion.
Do you need the ordering of
std::map
? If not, you might want to use std::unordered_map
instead (hash map instead of binary search tree).
Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.
edited Jul 16 at 16:07
answered Jul 14 at 17:16
hoffmale
4,205630
4,205630
add a comment |Â
add a comment |Â
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f199493%2fbasic-single-header-statistics-and-ml-libray-for-c-scikit-learn-like-impleme%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
3
You are aware that, even with the last sentence in the question, this is licensed under CC-BY-SA-3.0, right?
– Vogel612♦
Jul 14 at 17:15