Basic Single Header statistics and ml libray for C++ - Scikit-Learn like implementation

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
2
down vote

favorite












I am developing scikit-learn like implementation for C++ it is in the initial stage while developing I've started doubt myself that is this the correct implementation, since here accuracy is more important than robustness. I am still learning C++ and I would like an expert consultation since the development is at beginning stage I could fix and prevent future errors. Thank you



#include<iostream>
#include<vector>
#include<map>
#include<algorithm>
#include<typeinfo>
#include<cmath>

//===================================================================
// FOR COMPUTING MEAN
//===================================================================

template<typename T>
class mean

public:
T get_mean(std::vector<T> vec)

T total = 0;
for (auto i : vec) total += i;
auto average = total / vec.size();
return average;


;


//===================================================================
// FOR COMPUTING MEDIAN
//===================================================================
template<typename T>
class median

public:
T get_median(std::vector<T> vec, bool sorted = false)

if (sorted == false)std::sort(vec.begin(), vec.end());
auto vector_size = vec.size();
if (vector_size == 1) return vec[0];
// If the elements are odd return the middle element
if (vector_size % 2 != 0) return vec[vector_size / 2];
// If the elements count are even return the average of middle elements
auto middle_element_one = vector_size / 2;
auto middle_element_two = middle_element_one - 1;
auto result = (vec[middle_element_one] + vec[middle_element_two]) / 2;
return result;


;


//====================================================================
// FOR COMPUTING THE MODE
//====================================================================

template<typename T>
class mode

public:
T get_mode(std::vector<T> vec)

std::sort(vec.begin(), vec.end());
std::map<T, unsigned long int> number_count_table;
std::vector<T> elements_with_max_occurences;
unsigned long int bigger_no = 1;
for (auto i : vec)

if (number_count_table.find(i) == number_count_table.end()) number_count_table[i] = 1;
else
auto current_count = number_count_table[i];
current_count++;
if (current_count>bigger_no) bigger_no = current_count;
number_count_table[i] = current_count;


if (bigger_no == 1)
return vec[0];

else

for (auto itr : number_count_table)

if (itr.second == bigger_no) elements_with_max_occurences.push_back(itr.first);

std::sort(elements_with_max_occurences.begin(), elements_with_max_occurences.end());
return elements_with_max_occurences[0];


;



//========================================================================================
// FOR COMPUTING WEIGHTED MEAN
//========================================================================================

template <typename T>
class weighted_mean

private:
unsigned long int vector_size;
public:
T get_weighted_mean(std::vector<T> vec, std::vector<T> weights)

this->vector_size = vec.size();
T numerator = 0;
T total_weight = 0;
for (unsigned long int i = 0; i<vector_size; i++)

T current_value = vec[i] * weights[i];
numerator += current_value;
total_weight += weights[i];

//std::cout << "NUMERATOR: " << numerator << "n";
//std::cout << "DENOMINATOR: " << summation_of_weights << "n";
return numerator / total_weight;

;


//==========================================================================================
// FOR COMPUTING STANDARD DEVIATION
//==========================================================================================
template <typename T>
class standard_deviation : public mean<T>

private:
T mean_value;
T standard_deviation_value;
public:
T get_standard_deviation(std::vector<T> vec)

this->mean_value = this->get_mean(vec);
this->standard_deviation_value = 0;
for (unsigned long int i = 0; i<vec.size(); ++i)

T powered_value = (vec[i] - this->mean_value) * (vec[i] - this->mean_value);
this->standard_deviation_value += powered_value;

this->standard_deviation_value = sqrt(this->standard_deviation_value / vec.size());
return this->standard_deviation_value;

;


//==========================================================================================
// FOR COMPUTING INTERQUARTILE RANGE
//==========================================================================================
/*

CALCULATING INTERQUARTILE RANGE FOR ODD NO OF ELEMENTS:
-------------------------------------------------------

Given Elements(E) = 3,5,5,7,8,8,9,10,13,13,14,15,16,22,23

Step 1: Sort it! Here we already have an sorted values

E = 3, 5, 5, 7, 8, 8, 9, 10, 13, 13, 14, 15, 16, 22, 23

Step 2: Get the median for the values

Median (M) = 10

Step 3: The lower and the upper half

Elements which are left to the median are called left half and
to the right is right half

3 5 5 7 8 8 9 10 13 13 14 15 16 22 23
|___________| | |__________________|
LOWER HALF MEDIAN UPPER HALF

Step 4:
Q1 = median of left half
Q3 = median of right half

interquartile_range = Q3 - Q1

CALCULATING INTERQUARTILE RANGE FOR EVEN NO OF ELEMENTS:
--------------------------------------------------------
Step 1: Sort the array
Step 2: Get the median of the array
Step 3: Insert the median back to the array
Step 4: Follow odd no procedure since we have an array
of odd size.

*/
template <typename T>
class interquartile_range : public median<T>

private:
bool is_odd_vector;
public:
T get_interquartile_range(std::vector<T> vec, bool sorted = false)

if (sorted == false) std::sort(vec.begin(), vec.end());
if (vec.size() % 2 != 0) is_odd_vector = true;
else is_odd_vector = false;
if (is_odd_vector)

return compute(vec);

else

unsigned long int middle_index = vec.size() / 2;
T median_for_vector = this->get_median(vec);
vec.insert(vec.begin() + middle_index, median_for_vector);
return compute(vec);


private:
T compute(std::vector<T> vec)

unsigned long int middle_element_index = vec.size() / 2;
std::vector<T> lower_half;
for (unsigned long int i = 0; i < middle_element_index; i++)

lower_half.push_back(vec[i]);

std::vector<T> upper_half;
for (unsigned long int i = middle_element_index + 1; i<vec.size(); i++)

upper_half.push_back(vec[i]);


T q1 = this->get_median(lower_half);
T q3 = this->get_median(upper_half);
return q3 - q1;

;


//================================================================================
// FREQUENCY MAP TO VECTOR CONVERTER
//================================================================================

template <typename T>
class frequency_map_converter

public:
void to_vector(std::map<T, unsigned long int> frequency_map, std::vector<T> &target_vector)

for (auto element : frequency_map)

for (unsigned long int i = 0; i < element.second; i++) target_vector.push_back(element.first);


;



//================================================================================
// FOR CALCULATING THE RANGE
//================================================================================

/*

HOW TO CALCULATE THE RANGE
--------------------------

sorted input vector = 1, 2, 3, 4
greatest_value = 4
least_value = 1
range = greatest_value - least_value
i.e range = 3
*/

template <typename T>
class range

public:
T get_range(std::vector<T> vec, bool sorted = false)

if (sorted == false) std::sort(vec.begin(), vec.end());
T greatest_value = vec[vec.size() - 1];
T least_value = vec[0];
return greatest_value - least_value;

;



//===============================================================================
// FOR CALCULATING THE QUARTILE
//===============================================================================

template <typename T>
class quartile : public median<T>

public:
std::map<std::string, T> get_quartile(std::vector<T> vec, bool sorted = false)

if (sorted == false) std::sort(vec.begin(), vec.end());
std::map<std::string, T> result;
result["q2"] = this->get_median(vec, sorted = true);
unsigned long int middle_index = vec.size() / 2;
std::vector<T> lower_half;
for (unsigned long int i = 0; i<middle_index; i++)

lower_half.push_back(vec[i]);

result["q1"] = this->get_median(lower_half, sorted = true);
// free the memory by clearning the lower half
lower_half.clear();
std::vector<T> upper_half;
if (vec.size() % 2 != 0) middle_index++;
for (unsigned long int i = middle_index; i<vec.size(); i++)

upper_half.push_back(vec[i]);

result["q3"] = this->get_median(upper_half, sorted = true);
return result;

;


// ====================== TEXT TO NUMERICAL DATA CONVERTERS ===================

template <typename T>
class LabelEncoder

private:
long int current_encoded_value = -1;
std::vector<T> array;
std::map < T, long int> encoded_values;
public:
void fit(std::vector<T> array)

this->array = array;
std::sort(array.begin(), array.end());
std::vector<T> sorted_array = array;
for (auto i : sorted_array)

if (encoded_values.find(i) == encoded_values.end())
current_encoded_value++;
encoded_values[i] = current_encoded_value;




std::vector<long int> transform(std::vector<T> array)

std::vector<long int> transformed_array;
for (auto i : array)

transformed_array.push_back(encoded_values[i]);

return transformed_array;


std::vector<long int> transform()

return transform(this->array);

;

/*
Future Milestones - To implement mostlty used vectorizers from scikit-learn:
1. Implement CountVectorizer
2. Implement TfidfVectorizer
*/

// ============================ END OF TEXT TO NUMERICAL ENCODERS ==============================


// ============================ ACTIVATION FUNCTIONS ============================

template <typename T>
class activation_function

public:
T identity(T value)

return value;


long double sigmoid(T value)

T negative_value = -1 * value;
long double exponential = exp(negative_value);
long double result = 1 / (1 + exponential);
return result;


long double tan_h(T value)

long double pos_exp = exp(value);
long double neg_exp = exp(-1 * value);
return (pos_exp - neg_exp) / (pos_exp + neg_exp);


int threshold(T value)

if (value < 0) return 0;
else return 1;

;


This code passed all the unit-tests from the Hacker-rank for some of the statistics exercises.



Most of the ML libraries available in C++ are without documentation so it is hard to understand and implement. So I took this initiative. The main aim is to make is very easy for the end user like scikit-learn does. It is single header so it is easier to be Incorporated in existing projects.



License: Apache 2.0
Documentation: https://github.com/VISWESWARAN1998/statx







share|improve this question















  • 3




    you are aware that even with the last sentence in the question, this is licensed under CC-BY-SA-3.0 right?
    – Vogel612♦
    Jul 14 at 17:15
















up vote
2
down vote

favorite












I am developing scikit-learn like implementation for C++ it is in the initial stage while developing I've started doubt myself that is this the correct implementation, since here accuracy is more important than robustness. I am still learning C++ and I would like an expert consultation since the development is at beginning stage I could fix and prevent future errors. Thank you



#include<iostream>
#include<vector>
#include<map>
#include<algorithm>
#include<typeinfo>
#include<cmath>

//===================================================================
// FOR COMPUTING MEAN
//===================================================================

template<typename T>
class mean

public:
T get_mean(std::vector<T> vec)

T total = 0;
for (auto i : vec) total += i;
auto average = total / vec.size();
return average;


;


//===================================================================
// FOR COMPUTING MEDIAN
//===================================================================
template<typename T>
class median

public:
T get_median(std::vector<T> vec, bool sorted = false)

if (sorted == false)std::sort(vec.begin(), vec.end());
auto vector_size = vec.size();
if (vector_size == 1) return vec[0];
// If the elements are odd return the middle element
if (vector_size % 2 != 0) return vec[vector_size / 2];
// If the elements count are even return the average of middle elements
auto middle_element_one = vector_size / 2;
auto middle_element_two = middle_element_one - 1;
auto result = (vec[middle_element_one] + vec[middle_element_two]) / 2;
return result;


;


//====================================================================
// FOR COMPUTING THE MODE
//====================================================================

template<typename T>
class mode

public:
T get_mode(std::vector<T> vec)

std::sort(vec.begin(), vec.end());
std::map<T, unsigned long int> number_count_table;
std::vector<T> elements_with_max_occurences;
unsigned long int bigger_no = 1;
for (auto i : vec)

if (number_count_table.find(i) == number_count_table.end()) number_count_table[i] = 1;
else
auto current_count = number_count_table[i];
current_count++;
if (current_count>bigger_no) bigger_no = current_count;
number_count_table[i] = current_count;


if (bigger_no == 1)
return vec[0];

else

for (auto itr : number_count_table)

if (itr.second == bigger_no) elements_with_max_occurences.push_back(itr.first);

std::sort(elements_with_max_occurences.begin(), elements_with_max_occurences.end());
return elements_with_max_occurences[0];


;



//========================================================================================
// FOR COMPUTING WEIGHTED MEAN
//========================================================================================

template <typename T>
class weighted_mean

private:
unsigned long int vector_size;
public:
T get_weighted_mean(std::vector<T> vec, std::vector<T> weights)

this->vector_size = vec.size();
T numerator = 0;
T total_weight = 0;
for (unsigned long int i = 0; i<vector_size; i++)

T current_value = vec[i] * weights[i];
numerator += current_value;
total_weight += weights[i];

//std::cout << "NUMERATOR: " << numerator << "n";
//std::cout << "DENOMINATOR: " << summation_of_weights << "n";
return numerator / total_weight;

;


//==========================================================================================
// FOR COMPUTING STANDARD DEVIATION
//==========================================================================================
template <typename T>
class standard_deviation : public mean<T>

private:
T mean_value;
T standard_deviation_value;
public:
T get_standard_deviation(std::vector<T> vec)

this->mean_value = this->get_mean(vec);
this->standard_deviation_value = 0;
for (unsigned long int i = 0; i<vec.size(); ++i)

T powered_value = (vec[i] - this->mean_value) * (vec[i] - this->mean_value);
this->standard_deviation_value += powered_value;

this->standard_deviation_value = sqrt(this->standard_deviation_value / vec.size());
return this->standard_deviation_value;

;


//==========================================================================================
// FOR COMPUTING INTERQUARTILE RANGE
//==========================================================================================
/*

CALCULATING INTERQUARTILE RANGE FOR ODD NO OF ELEMENTS:
-------------------------------------------------------

Given Elements(E) = 3,5,5,7,8,8,9,10,13,13,14,15,16,22,23

Step 1: Sort it! Here we already have an sorted values

E = 3, 5, 5, 7, 8, 8, 9, 10, 13, 13, 14, 15, 16, 22, 23

Step 2: Get the median for the values

Median (M) = 10

Step 3: The lower and the upper half

Elements which are left to the median are called left half and
to the right is right half

3 5 5 7 8 8 9 10 13 13 14 15 16 22 23
|___________| | |__________________|
LOWER HALF MEDIAN UPPER HALF

Step 4:
Q1 = median of left half
Q3 = median of right half

interquartile_range = Q3 - Q1

CALCULATING INTERQUARTILE RANGE FOR EVEN NO OF ELEMENTS:
--------------------------------------------------------
Step 1: Sort the array
Step 2: Get the median of the array
Step 3: Insert the median back to the array
Step 4: Follow odd no procedure since we have an array
of odd size.

*/
template <typename T>
class interquartile_range : public median<T>

private:
bool is_odd_vector;
public:
T get_interquartile_range(std::vector<T> vec, bool sorted = false)

if (sorted == false) std::sort(vec.begin(), vec.end());
if (vec.size() % 2 != 0) is_odd_vector = true;
else is_odd_vector = false;
if (is_odd_vector)

return compute(vec);

else

unsigned long int middle_index = vec.size() / 2;
T median_for_vector = this->get_median(vec);
vec.insert(vec.begin() + middle_index, median_for_vector);
return compute(vec);


private:
T compute(std::vector<T> vec)

unsigned long int middle_element_index = vec.size() / 2;
std::vector<T> lower_half;
for (unsigned long int i = 0; i < middle_element_index; i++)

lower_half.push_back(vec[i]);

std::vector<T> upper_half;
for (unsigned long int i = middle_element_index + 1; i<vec.size(); i++)

upper_half.push_back(vec[i]);


T q1 = this->get_median(lower_half);
T q3 = this->get_median(upper_half);
return q3 - q1;

;


//================================================================================
// FREQUENCY MAP TO VECTOR CONVERTER
//================================================================================

template <typename T>
class frequency_map_converter

public:
void to_vector(std::map<T, unsigned long int> frequency_map, std::vector<T> &target_vector)

for (auto element : frequency_map)

for (unsigned long int i = 0; i < element.second; i++) target_vector.push_back(element.first);


;



//================================================================================
// FOR CALCULATING THE RANGE
//================================================================================

/*

HOW TO CALCULATE THE RANGE
--------------------------

sorted input vector = 1, 2, 3, 4
greatest_value = 4
least_value = 1
range = greatest_value - least_value
i.e range = 3
*/

template <typename T>
class range

public:
T get_range(std::vector<T> vec, bool sorted = false)

if (sorted == false) std::sort(vec.begin(), vec.end());
T greatest_value = vec[vec.size() - 1];
T least_value = vec[0];
return greatest_value - least_value;

;



//===============================================================================
// FOR CALCULATING THE QUARTILE
//===============================================================================

template <typename T>
class quartile : public median<T>

public:
std::map<std::string, T> get_quartile(std::vector<T> vec, bool sorted = false)

if (sorted == false) std::sort(vec.begin(), vec.end());
std::map<std::string, T> result;
result["q2"] = this->get_median(vec, sorted = true);
unsigned long int middle_index = vec.size() / 2;
std::vector<T> lower_half;
for (unsigned long int i = 0; i<middle_index; i++)

lower_half.push_back(vec[i]);

result["q1"] = this->get_median(lower_half, sorted = true);
// free the memory by clearning the lower half
lower_half.clear();
std::vector<T> upper_half;
if (vec.size() % 2 != 0) middle_index++;
for (unsigned long int i = middle_index; i<vec.size(); i++)

upper_half.push_back(vec[i]);

result["q3"] = this->get_median(upper_half, sorted = true);
return result;

;


// ====================== TEXT TO NUMERICAL DATA CONVERTERS ===================

template <typename T>
class LabelEncoder

private:
long int current_encoded_value = -1;
std::vector<T> array;
std::map < T, long int> encoded_values;
public:
void fit(std::vector<T> array)

this->array = array;
std::sort(array.begin(), array.end());
std::vector<T> sorted_array = array;
for (auto i : sorted_array)

if (encoded_values.find(i) == encoded_values.end())
current_encoded_value++;
encoded_values[i] = current_encoded_value;




std::vector<long int> transform(std::vector<T> array)

std::vector<long int> transformed_array;
for (auto i : array)

transformed_array.push_back(encoded_values[i]);

return transformed_array;


std::vector<long int> transform()

return transform(this->array);

;

/*
Future Milestones - To implement mostlty used vectorizers from scikit-learn:
1. Implement CountVectorizer
2. Implement TfidfVectorizer
*/

// ============================ END OF TEXT TO NUMERICAL ENCODERS ==============================


// ============================ ACTIVATION FUNCTIONS ============================

template <typename T>
class activation_function

public:
T identity(T value)

return value;


long double sigmoid(T value)

T negative_value = -1 * value;
long double exponential = exp(negative_value);
long double result = 1 / (1 + exponential);
return result;


long double tan_h(T value)

long double pos_exp = exp(value);
long double neg_exp = exp(-1 * value);
return (pos_exp - neg_exp) / (pos_exp + neg_exp);


int threshold(T value)

if (value < 0) return 0;
else return 1;

;


This code passed all the unit-tests from the Hacker-rank for some of the statistics exercises.



Most of the ML libraries available in C++ are without documentation so it is hard to understand and implement. So I took this initiative. The main aim is to make is very easy for the end user like scikit-learn does. It is single header so it is easier to be Incorporated in existing projects.



License: Apache 2.0
Documentation: https://github.com/VISWESWARAN1998/statx







share|improve this question















  • 3




    you are aware that even with the last sentence in the question, this is licensed under CC-BY-SA-3.0 right?
    – Vogel612♦
    Jul 14 at 17:15












up vote
2
down vote

favorite









up vote
2
down vote

favorite











I am developing scikit-learn like implementation for C++ it is in the initial stage while developing I've started doubt myself that is this the correct implementation, since here accuracy is more important than robustness. I am still learning C++ and I would like an expert consultation since the development is at beginning stage I could fix and prevent future errors. Thank you



#include<iostream>
#include<vector>
#include<map>
#include<algorithm>
#include<typeinfo>
#include<cmath>

//===================================================================
// FOR COMPUTING MEAN
//===================================================================

template<typename T>
class mean

public:
T get_mean(std::vector<T> vec)

T total = 0;
for (auto i : vec) total += i;
auto average = total / vec.size();
return average;


;


//===================================================================
// FOR COMPUTING MEDIAN
//===================================================================
template<typename T>
class median

public:
T get_median(std::vector<T> vec, bool sorted = false)

if (sorted == false)std::sort(vec.begin(), vec.end());
auto vector_size = vec.size();
if (vector_size == 1) return vec[0];
// If the elements are odd return the middle element
if (vector_size % 2 != 0) return vec[vector_size / 2];
// If the elements count are even return the average of middle elements
auto middle_element_one = vector_size / 2;
auto middle_element_two = middle_element_one - 1;
auto result = (vec[middle_element_one] + vec[middle_element_two]) / 2;
return result;


;


//====================================================================
// FOR COMPUTING THE MODE
//====================================================================

template<typename T>
class mode

public:
T get_mode(std::vector<T> vec)

std::sort(vec.begin(), vec.end());
std::map<T, unsigned long int> number_count_table;
std::vector<T> elements_with_max_occurences;
unsigned long int bigger_no = 1;
for (auto i : vec)

if (number_count_table.find(i) == number_count_table.end()) number_count_table[i] = 1;
else
auto current_count = number_count_table[i];
current_count++;
if (current_count>bigger_no) bigger_no = current_count;
number_count_table[i] = current_count;


if (bigger_no == 1)
return vec[0];

else

for (auto itr : number_count_table)

if (itr.second == bigger_no) elements_with_max_occurences.push_back(itr.first);

std::sort(elements_with_max_occurences.begin(), elements_with_max_occurences.end());
return elements_with_max_occurences[0];


;



//========================================================================================
// FOR COMPUTING WEIGHTED MEAN
//========================================================================================

template <typename T>
class weighted_mean

private:
unsigned long int vector_size;
public:
T get_weighted_mean(std::vector<T> vec, std::vector<T> weights)

this->vector_size = vec.size();
T numerator = 0;
T total_weight = 0;
for (unsigned long int i = 0; i<vector_size; i++)

T current_value = vec[i] * weights[i];
numerator += current_value;
total_weight += weights[i];

//std::cout << "NUMERATOR: " << numerator << "n";
//std::cout << "DENOMINATOR: " << summation_of_weights << "n";
return numerator / total_weight;

;


//==========================================================================================
// FOR COMPUTING STANDARD DEVIATION
//==========================================================================================
template <typename T>
class standard_deviation : public mean<T>

private:
T mean_value;
T standard_deviation_value;
public:
T get_standard_deviation(std::vector<T> vec)

this->mean_value = this->get_mean(vec);
this->standard_deviation_value = 0;
for (unsigned long int i = 0; i<vec.size(); ++i)

T powered_value = (vec[i] - this->mean_value) * (vec[i] - this->mean_value);
this->standard_deviation_value += powered_value;

this->standard_deviation_value = sqrt(this->standard_deviation_value / vec.size());
return this->standard_deviation_value;

;


//==========================================================================================
// FOR COMPUTING INTERQUARTILE RANGE
//==========================================================================================
/*

CALCULATING INTERQUARTILE RANGE FOR ODD NO OF ELEMENTS:
-------------------------------------------------------

Given Elements(E) = 3,5,5,7,8,8,9,10,13,13,14,15,16,22,23

Step 1: Sort it! Here we already have an sorted values

E = 3, 5, 5, 7, 8, 8, 9, 10, 13, 13, 14, 15, 16, 22, 23

Step 2: Get the median for the values

Median (M) = 10

Step 3: The lower and the upper half

Elements which are left to the median are called left half and
to the right is right half

3 5 5 7 8 8 9 10 13 13 14 15 16 22 23
|___________| | |__________________|
LOWER HALF MEDIAN UPPER HALF

Step 4:
Q1 = median of left half
Q3 = median of right half

interquartile_range = Q3 - Q1

CALCULATING INTERQUARTILE RANGE FOR EVEN NO OF ELEMENTS:
--------------------------------------------------------
Step 1: Sort the array
Step 2: Get the median of the array
Step 3: Insert the median back to the array
Step 4: Follow odd no procedure since we have an array
of odd size.

*/
template <typename T>
class interquartile_range : public median<T>

private:
bool is_odd_vector;
public:
T get_interquartile_range(std::vector<T> vec, bool sorted = false)

if (sorted == false) std::sort(vec.begin(), vec.end());
if (vec.size() % 2 != 0) is_odd_vector = true;
else is_odd_vector = false;
if (is_odd_vector)

return compute(vec);

else

unsigned long int middle_index = vec.size() / 2;
T median_for_vector = this->get_median(vec);
vec.insert(vec.begin() + middle_index, median_for_vector);
return compute(vec);


private:
T compute(std::vector<T> vec)

unsigned long int middle_element_index = vec.size() / 2;
std::vector<T> lower_half;
for (unsigned long int i = 0; i < middle_element_index; i++)

lower_half.push_back(vec[i]);

std::vector<T> upper_half;
for (unsigned long int i = middle_element_index + 1; i<vec.size(); i++)

upper_half.push_back(vec[i]);


T q1 = this->get_median(lower_half);
T q3 = this->get_median(upper_half);
return q3 - q1;

;


//================================================================================
// FREQUENCY MAP TO VECTOR CONVERTER
//================================================================================

template <typename T>
class frequency_map_converter

public:
void to_vector(std::map<T, unsigned long int> frequency_map, std::vector<T> &target_vector)

for (auto element : frequency_map)

for (unsigned long int i = 0; i < element.second; i++) target_vector.push_back(element.first);


;



//================================================================================
// FOR CALCULATING THE RANGE
//================================================================================

/*

HOW TO CALCULATE THE RANGE
--------------------------

sorted input vector = 1, 2, 3, 4
greatest_value = 4
least_value = 1
range = greatest_value - least_value
i.e range = 3
*/

template <typename T>
class range

public:
T get_range(std::vector<T> vec, bool sorted = false)

if (sorted == false) std::sort(vec.begin(), vec.end());
T greatest_value = vec[vec.size() - 1];
T least_value = vec[0];
return greatest_value - least_value;

;



//===============================================================================
// FOR CALCULATING THE QUARTILE
//===============================================================================

template <typename T>
class quartile : public median<T>

public:
std::map<std::string, T> get_quartile(std::vector<T> vec, bool sorted = false)

if (sorted == false) std::sort(vec.begin(), vec.end());
std::map<std::string, T> result;
result["q2"] = this->get_median(vec, sorted = true);
unsigned long int middle_index = vec.size() / 2;
std::vector<T> lower_half;
for (unsigned long int i = 0; i<middle_index; i++)

lower_half.push_back(vec[i]);

result["q1"] = this->get_median(lower_half, sorted = true);
// free the memory by clearning the lower half
lower_half.clear();
std::vector<T> upper_half;
if (vec.size() % 2 != 0) middle_index++;
for (unsigned long int i = middle_index; i<vec.size(); i++)

upper_half.push_back(vec[i]);

result["q3"] = this->get_median(upper_half, sorted = true);
return result;

;


// ====================== TEXT TO NUMERICAL DATA CONVERTERS ===================

template <typename T>
class LabelEncoder

private:
long int current_encoded_value = -1;
std::vector<T> array;
std::map < T, long int> encoded_values;
public:
void fit(std::vector<T> array)

this->array = array;
std::sort(array.begin(), array.end());
std::vector<T> sorted_array = array;
for (auto i : sorted_array)

if (encoded_values.find(i) == encoded_values.end())
current_encoded_value++;
encoded_values[i] = current_encoded_value;




std::vector<long int> transform(std::vector<T> array)

std::vector<long int> transformed_array;
for (auto i : array)

transformed_array.push_back(encoded_values[i]);

return transformed_array;


std::vector<long int> transform()

return transform(this->array);

;

/*
Future Milestones - To implement mostlty used vectorizers from scikit-learn:
1. Implement CountVectorizer
2. Implement TfidfVectorizer
*/

// ============================ END OF TEXT TO NUMERICAL ENCODERS ==============================


// ============================ ACTIVATION FUNCTIONS ============================

template <typename T>
class activation_function

public:
T identity(T value)

return value;


long double sigmoid(T value)

T negative_value = -1 * value;
long double exponential = exp(negative_value);
long double result = 1 / (1 + exponential);
return result;


long double tan_h(T value)

long double pos_exp = exp(value);
long double neg_exp = exp(-1 * value);
return (pos_exp - neg_exp) / (pos_exp + neg_exp);


int threshold(T value)

if (value < 0) return 0;
else return 1;

;


This code passed all the unit-tests from the Hacker-rank for some of the statistics exercises.



Most of the ML libraries available in C++ are without documentation so it is hard to understand and implement. So I took this initiative. The main aim is to make is very easy for the end user like scikit-learn does. It is single header so it is easier to be Incorporated in existing projects.



License: Apache 2.0
Documentation: https://github.com/VISWESWARAN1998/statx







share|improve this question











I am developing scikit-learn like implementation for C++ it is in the initial stage while developing I've started doubt myself that is this the correct implementation, since here accuracy is more important than robustness. I am still learning C++ and I would like an expert consultation since the development is at beginning stage I could fix and prevent future errors. Thank you



#include<iostream>
#include<vector>
#include<map>
#include<algorithm>
#include<typeinfo>
#include<cmath>

//===================================================================
// FOR COMPUTING MEAN
//===================================================================

template<typename T>
class mean

public:
T get_mean(std::vector<T> vec)

T total = 0;
for (auto i : vec) total += i;
auto average = total / vec.size();
return average;


;


//===================================================================
// FOR COMPUTING MEDIAN
//===================================================================
template<typename T>
class median

public:
T get_median(std::vector<T> vec, bool sorted = false)

if (sorted == false)std::sort(vec.begin(), vec.end());
auto vector_size = vec.size();
if (vector_size == 1) return vec[0];
// If the elements are odd return the middle element
if (vector_size % 2 != 0) return vec[vector_size / 2];
// If the elements count are even return the average of middle elements
auto middle_element_one = vector_size / 2;
auto middle_element_two = middle_element_one - 1;
auto result = (vec[middle_element_one] + vec[middle_element_two]) / 2;
return result;


;


//====================================================================
// FOR COMPUTING THE MODE
//====================================================================

template<typename T>
class mode

public:
T get_mode(std::vector<T> vec)

std::sort(vec.begin(), vec.end());
std::map<T, unsigned long int> number_count_table;
std::vector<T> elements_with_max_occurences;
unsigned long int bigger_no = 1;
for (auto i : vec)

if (number_count_table.find(i) == number_count_table.end()) number_count_table[i] = 1;
else
auto current_count = number_count_table[i];
current_count++;
if (current_count>bigger_no) bigger_no = current_count;
number_count_table[i] = current_count;


if (bigger_no == 1)
return vec[0];

else

for (auto itr : number_count_table)

if (itr.second == bigger_no) elements_with_max_occurences.push_back(itr.first);

std::sort(elements_with_max_occurences.begin(), elements_with_max_occurences.end());
return elements_with_max_occurences[0];


;



//========================================================================================
// FOR COMPUTING WEIGHTED MEAN
//========================================================================================

template <typename T>
class weighted_mean

private:
unsigned long int vector_size;
public:
T get_weighted_mean(std::vector<T> vec, std::vector<T> weights)

this->vector_size = vec.size();
T numerator = 0;
T total_weight = 0;
for (unsigned long int i = 0; i<vector_size; i++)

T current_value = vec[i] * weights[i];
numerator += current_value;
total_weight += weights[i];

//std::cout << "NUMERATOR: " << numerator << "n";
//std::cout << "DENOMINATOR: " << summation_of_weights << "n";
return numerator / total_weight;

;


//==========================================================================================
// FOR COMPUTING STANDARD DEVIATION
//==========================================================================================
template <typename T>
class standard_deviation : public mean<T>

private:
T mean_value;
T standard_deviation_value;
public:
T get_standard_deviation(std::vector<T> vec)

this->mean_value = this->get_mean(vec);
this->standard_deviation_value = 0;
for (unsigned long int i = 0; i<vec.size(); ++i)

T powered_value = (vec[i] - this->mean_value) * (vec[i] - this->mean_value);
this->standard_deviation_value += powered_value;

this->standard_deviation_value = sqrt(this->standard_deviation_value / vec.size());
return this->standard_deviation_value;

;


//==========================================================================================
// FOR COMPUTING INTERQUARTILE RANGE
//==========================================================================================
/*

CALCULATING INTERQUARTILE RANGE FOR ODD NO OF ELEMENTS:
-------------------------------------------------------

Given Elements(E) = 3,5,5,7,8,8,9,10,13,13,14,15,16,22,23

Step 1: Sort it! Here we already have an sorted values

E = 3, 5, 5, 7, 8, 8, 9, 10, 13, 13, 14, 15, 16, 22, 23

Step 2: Get the median for the values

Median (M) = 10

Step 3: The lower and the upper half

Elements which are left to the median are called left half and
to the right is right half

3 5 5 7 8 8 9 10 13 13 14 15 16 22 23
|___________| | |__________________|
LOWER HALF MEDIAN UPPER HALF

Step 4:
Q1 = median of left half
Q3 = median of right half

interquartile_range = Q3 - Q1

CALCULATING INTERQUARTILE RANGE FOR EVEN NO OF ELEMENTS:
--------------------------------------------------------
Step 1: Sort the array
Step 2: Get the median of the array
Step 3: Insert the median back to the array
Step 4: Follow odd no procedure since we have an array
of odd size.

*/
template <typename T>
class interquartile_range : public median<T>

private:
bool is_odd_vector;
public:
T get_interquartile_range(std::vector<T> vec, bool sorted = false)

if (sorted == false) std::sort(vec.begin(), vec.end());
if (vec.size() % 2 != 0) is_odd_vector = true;
else is_odd_vector = false;
if (is_odd_vector)

return compute(vec);

else

unsigned long int middle_index = vec.size() / 2;
T median_for_vector = this->get_median(vec);
vec.insert(vec.begin() + middle_index, median_for_vector);
return compute(vec);


private:
T compute(std::vector<T> vec)

unsigned long int middle_element_index = vec.size() / 2;
std::vector<T> lower_half;
for (unsigned long int i = 0; i < middle_element_index; i++)

lower_half.push_back(vec[i]);

std::vector<T> upper_half;
for (unsigned long int i = middle_element_index + 1; i<vec.size(); i++)

upper_half.push_back(vec[i]);


T q1 = this->get_median(lower_half);
T q3 = this->get_median(upper_half);
return q3 - q1;

;


//================================================================================
// FREQUENCY MAP TO VECTOR CONVERTER
//================================================================================

template <typename T>
class frequency_map_converter

public:
void to_vector(std::map<T, unsigned long int> frequency_map, std::vector<T> &target_vector)

for (auto element : frequency_map)

for (unsigned long int i = 0; i < element.second; i++) target_vector.push_back(element.first);


;



//================================================================================
// FOR CALCULATING THE RANGE
//================================================================================

/*

HOW TO CALCULATE THE RANGE
--------------------------

sorted input vector = 1, 2, 3, 4
greatest_value = 4
least_value = 1
range = greatest_value - least_value
i.e range = 3
*/

template <typename T>
class range

public:
T get_range(std::vector<T> vec, bool sorted = false)

if (sorted == false) std::sort(vec.begin(), vec.end());
T greatest_value = vec[vec.size() - 1];
T least_value = vec[0];
return greatest_value - least_value;

;



//===============================================================================
// FOR CALCULATING THE QUARTILE
//===============================================================================

template <typename T>
class quartile : public median<T>

public:
std::map<std::string, T> get_quartile(std::vector<T> vec, bool sorted = false)

if (sorted == false) std::sort(vec.begin(), vec.end());
std::map<std::string, T> result;
result["q2"] = this->get_median(vec, sorted = true);
unsigned long int middle_index = vec.size() / 2;
std::vector<T> lower_half;
for (unsigned long int i = 0; i<middle_index; i++)

lower_half.push_back(vec[i]);

result["q1"] = this->get_median(lower_half, sorted = true);
// free the memory by clearning the lower half
lower_half.clear();
std::vector<T> upper_half;
if (vec.size() % 2 != 0) middle_index++;
for (unsigned long int i = middle_index; i<vec.size(); i++)

upper_half.push_back(vec[i]);

result["q3"] = this->get_median(upper_half, sorted = true);
return result;

;


// ====================== TEXT TO NUMERICAL DATA CONVERTERS ===================

template <typename T>
class LabelEncoder

private:
long int current_encoded_value = -1;
std::vector<T> array;
std::map < T, long int> encoded_values;
public:
void fit(std::vector<T> array)

this->array = array;
std::sort(array.begin(), array.end());
std::vector<T> sorted_array = array;
for (auto i : sorted_array)

if (encoded_values.find(i) == encoded_values.end())
current_encoded_value++;
encoded_values[i] = current_encoded_value;




std::vector<long int> transform(std::vector<T> array)

std::vector<long int> transformed_array;
for (auto i : array)

transformed_array.push_back(encoded_values[i]);

return transformed_array;


std::vector<long int> transform()

return transform(this->array);

;

/*
Future Milestones - To implement mostlty used vectorizers from scikit-learn:
1. Implement CountVectorizer
2. Implement TfidfVectorizer
*/

// ============================ END OF TEXT TO NUMERICAL ENCODERS ==============================


// ============================ ACTIVATION FUNCTIONS ============================

template <typename T>
class activation_function

public:
T identity(T value)

return value;


long double sigmoid(T value)

T negative_value = -1 * value;
long double exponential = exp(negative_value);
long double result = 1 / (1 + exponential);
return result;


long double tan_h(T value)

long double pos_exp = exp(value);
long double neg_exp = exp(-1 * value);
return (pos_exp - neg_exp) / (pos_exp + neg_exp);


int threshold(T value)

if (value < 0) return 0;
else return 1;

;


This code passed all the unit-tests from the Hacker-rank for some of the statistics exercises.



Most of the ML libraries available in C++ are without documentation so it is hard to understand and implement. So I took this initiative. The main aim is to make is very easy for the end user like scikit-learn does. It is single header so it is easier to be Incorporated in existing projects.



License: Apache 2.0
Documentation: https://github.com/VISWESWARAN1998/statx









share|improve this question










share|improve this question




share|improve this question









asked Jul 14 at 15:56









VISWESWARAN NAGASIVAM

192214




192214







  • 3




    you are aware that even with the last sentence in the question, this is licensed under CC-BY-SA-3.0 right?
    – Vogel612♦
    Jul 14 at 17:15












  • 3




    you are aware that even with the last sentence in the question, this is licensed under CC-BY-SA-3.0 right?
    – Vogel612♦
    Jul 14 at 17:15







3




3




you are aware that even with the last sentence in the question, this is licensed under CC-BY-SA-3.0 right?
– Vogel612♦
Jul 14 at 17:15




you are aware that even with the last sentence in the question, this is licensed under CC-BY-SA-3.0 right?
– Vogel612♦
Jul 14 at 17:15










1 Answer
1






active

oldest

votes

















up vote
4
down vote



accepted










Wrapper classes



I don't get why many algorithms (like get_mean, get_median, ...) are wrapped inside a class that is basically just a fancy namespace.



Why force the user to write mean.get_mean(...) if mean(...) would suffice?



Repetition



In many functions, there is a repeating pattern: The function takes a bool sorted parameter, and the first statement is if(!sorted) std::sort(...);.



First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.



Second, in many cases it isn't actually necessary to fully sort the std::vector ($mathcalO(n log n)$): There might be alternatives (like std::nth_element, $mathcalO(n)$) that can do the intended job and have better performance.



Last, why pass explicitly std::vector<T>? Not only does this force a copy of all the underlying data, many of those algorithms are general enough so that they should work on other data structures, too! Taking a pair of iterators also means that they can be applied to parts of containers (e.g. simplifying the quartile calculation).



For example, fixing these issues on median::get_medain



template<typename Iter>
auto median_of_sorted(Iter begin, Iter end)
auto size = std::distance(begin, end);

if(size % 2 != 0)
return *(begin + size / 2);


auto middle_low = *(begin + (size - 1) / 2);
auto middle_high = *(begin + (size + 1) / 2);

return (middle_low + middle_high) / 2;


template<typename Iter>
auto median(Iter begin, Iter end)
auto size = std::distance(begin, end);

if(size % 2 != 0)
auto middle = begin + size / 2;
std::nth_element(begin, middle, end);
return *middle;


auto middle_low = begin + (size - 1) / 2;
auto middle_high = begin + (size + 1) / 2;

std::nth_element(begin, middle_high, end);
std::nth_element(begin, middle_low, middle_high);

return (*middle_low + *middle_high) / 2;




median_of_sorted has $mathcalO(1)$ runtime complexity (for random access iterators, $mathcalO(n)$ otherwise) and doesn't modify the container.



median has $mathcalO(n)$ runtime complexity and might modify the container.



Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.



Note: The code from median/median_of_sorted can easily be extended to calculate any percentile.




Other issues



  • unsigned long int might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t or uint64_t from the header <cstdint>. If the bit length itself is not relevant, maybe use size_t (or simply unsigned) instead to prevent confusion.


  • Do you need the ordering of std::map? If not, you might want to use std::unordered_map instead (hash map instead of binary search tree).


  • Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.






share|improve this answer























    Your Answer




    StackExchange.ifUsing("editor", function ()
    return StackExchange.using("mathjaxEditing", function ()
    StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
    StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
    );
    );
    , "mathjax-editing");

    StackExchange.ifUsing("editor", function ()
    StackExchange.using("externalEditor", function ()
    StackExchange.using("snippets", function ()
    StackExchange.snippets.init();
    );
    );
    , "code-snippets");

    StackExchange.ready(function()
    var channelOptions =
    tags: "".split(" "),
    id: "196"
    ;
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function()
    // Have to fire editor after snippets, if snippets enabled
    if (StackExchange.settings.snippets.snippetsEnabled)
    StackExchange.using("snippets", function()
    createEditor();
    );

    else
    createEditor();

    );

    function createEditor()
    StackExchange.prepareEditor(
    heartbeatType: 'answer',
    convertImagesToLinks: false,
    noModals: false,
    showLowRepImageUploadWarning: true,
    reputationToPostImages: null,
    bindNavPrevention: true,
    postfix: "",
    onDemand: true,
    discardSelector: ".discard-answer"
    ,immediatelyShowMarkdownHelp:true
    );



    );








     

    draft saved


    draft discarded


















    StackExchange.ready(
    function ()
    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f199493%2fbasic-single-header-statistics-and-ml-libray-for-c-scikit-learn-like-impleme%23new-answer', 'question_page');

    );

    Post as a guest






























    1 Answer
    1






    active

    oldest

    votes








    1 Answer
    1






    active

    oldest

    votes









    active

    oldest

    votes






    active

    oldest

    votes








    up vote
    4
    down vote



    accepted










    Wrapper classes



    I don't get why many algorithms (like get_mean, get_median, ...) are wrapped inside a class that is basically just a fancy namespace.



    Why force the user to write mean.get_mean(...) if mean(...) would suffice?



    Repetition



    In many functions, there is a repeating pattern: The function takes a bool sorted parameter, and the first statement is if(!sorted) std::sort(...);.



    First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.



    Second, in many cases it isn't actually necessary to fully sort the std::vector ($mathcalO(n log n)$): There might be alternatives (like std::nth_element, $mathcalO(n)$) that can do the intended job and have better performance.



    Last, why pass explicitly std::vector<T>? Not only does this force a copy of all the underlying data, many of those algorithms are general enough so that they should work on other data structures, too! Taking a pair of iterators also means that they can be applied to parts of containers (e.g. simplifying the quartile calculation).



    For example, fixing these issues on median::get_medain



    template<typename Iter>
    auto median_of_sorted(Iter begin, Iter end)
    auto size = std::distance(begin, end);

    if(size % 2 != 0)
    return *(begin + size / 2);


    auto middle_low = *(begin + (size - 1) / 2);
    auto middle_high = *(begin + (size + 1) / 2);

    return (middle_low + middle_high) / 2;


    template<typename Iter>
    auto median(Iter begin, Iter end)
    auto size = std::distance(begin, end);

    if(size % 2 != 0)
    auto middle = begin + size / 2;
    std::nth_element(begin, middle, end);
    return *middle;


    auto middle_low = begin + (size - 1) / 2;
    auto middle_high = begin + (size + 1) / 2;

    std::nth_element(begin, middle_high, end);
    std::nth_element(begin, middle_low, middle_high);

    return (*middle_low + *middle_high) / 2;




    median_of_sorted has $mathcalO(1)$ runtime complexity (for random access iterators, $mathcalO(n)$ otherwise) and doesn't modify the container.



    median has $mathcalO(n)$ runtime complexity and might modify the container.



    Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.



    Note: The code from median/median_of_sorted can easily be extended to calculate any percentile.




    Other issues



    • unsigned long int might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t or uint64_t from the header <cstdint>. If the bit length itself is not relevant, maybe use size_t (or simply unsigned) instead to prevent confusion.


    • Do you need the ordering of std::map? If not, you might want to use std::unordered_map instead (hash map instead of binary search tree).


    • Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.






    share|improve this answer



























      up vote
      4
      down vote



      accepted










      Wrapper classes



      I don't get why many algorithms (like get_mean, get_median, ...) are wrapped inside a class that is basically just a fancy namespace.



      Why force the user to write mean.get_mean(...) if mean(...) would suffice?



      Repetition



      In many functions, there is a repeating pattern: The function takes a bool sorted parameter, and the first statement is if(!sorted) std::sort(...);.



      First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.



      Second, in many cases it isn't actually necessary to fully sort the std::vector ($mathcalO(n log n)$): There might be alternatives (like std::nth_element, $mathcalO(n)$) that can do the intended job and have better performance.



      Last, why pass explicitly std::vector<T>? Not only does this force a copy of all the underlying data, many of those algorithms are general enough so that they should work on other data structures, too! Taking a pair of iterators also means that they can be applied to parts of containers (e.g. simplifying the quartile calculation).



      For example, fixing these issues on median::get_medain



      template<typename Iter>
      auto median_of_sorted(Iter begin, Iter end)
      auto size = std::distance(begin, end);

      if(size % 2 != 0)
      return *(begin + size / 2);


      auto middle_low = *(begin + (size - 1) / 2);
      auto middle_high = *(begin + (size + 1) / 2);

      return (middle_low + middle_high) / 2;


      template<typename Iter>
      auto median(Iter begin, Iter end)
      auto size = std::distance(begin, end);

      if(size % 2 != 0)
      auto middle = begin + size / 2;
      std::nth_element(begin, middle, end);
      return *middle;


      auto middle_low = begin + (size - 1) / 2;
      auto middle_high = begin + (size + 1) / 2;

      std::nth_element(begin, middle_high, end);
      std::nth_element(begin, middle_low, middle_high);

      return (*middle_low + *middle_high) / 2;




      median_of_sorted has $mathcalO(1)$ runtime complexity (for random access iterators, $mathcalO(n)$ otherwise) and doesn't modify the container.



      median has $mathcalO(n)$ runtime complexity and might modify the container.



      Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.



      Note: The code from median/median_of_sorted can easily be extended to calculate any percentile.




      Other issues



      • unsigned long int might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t or uint64_t from the header <cstdint>. If the bit length itself is not relevant, maybe use size_t (or simply unsigned) instead to prevent confusion.


      • Do you need the ordering of std::map? If not, you might want to use std::unordered_map instead (hash map instead of binary search tree).


      • Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.






      share|improve this answer

























        up vote
        4
        down vote



        accepted







        up vote
        4
        down vote



        accepted






        Wrapper classes



        I don't get why many algorithms (like get_mean, get_median, ...) are wrapped inside a class that is basically just a fancy namespace.



        Why force the user to write mean.get_mean(...) if mean(...) would suffice?



        Repetition



        In many functions, there is a repeating pattern: The function takes a bool sorted parameter, and the first statement is if(!sorted) std::sort(...);.



        First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.



        Second, in many cases it isn't actually necessary to fully sort the std::vector ($mathcalO(n log n)$): There might be alternatives (like std::nth_element, $mathcalO(n)$) that can do the intended job and have better performance.



        Last, why pass explicitly std::vector<T>? Not only does this force a copy of all the underlying data, many of those algorithms are general enough so that they should work on other data structures, too! Taking a pair of iterators also means that they can be applied to parts of containers (e.g. simplifying the quartile calculation).



        For example, fixing these issues on median::get_medain



        template<typename Iter>
        auto median_of_sorted(Iter begin, Iter end)
        auto size = std::distance(begin, end);

        if(size % 2 != 0)
        return *(begin + size / 2);


        auto middle_low = *(begin + (size - 1) / 2);
        auto middle_high = *(begin + (size + 1) / 2);

        return (middle_low + middle_high) / 2;


        template<typename Iter>
        auto median(Iter begin, Iter end)
        auto size = std::distance(begin, end);

        if(size % 2 != 0)
        auto middle = begin + size / 2;
        std::nth_element(begin, middle, end);
        return *middle;


        auto middle_low = begin + (size - 1) / 2;
        auto middle_high = begin + (size + 1) / 2;

        std::nth_element(begin, middle_high, end);
        std::nth_element(begin, middle_low, middle_high);

        return (*middle_low + *middle_high) / 2;




        median_of_sorted has $mathcalO(1)$ runtime complexity (for random access iterators, $mathcalO(n)$ otherwise) and doesn't modify the container.



        median has $mathcalO(n)$ runtime complexity and might modify the container.



        Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.



        Note: The code from median/median_of_sorted can easily be extended to calculate any percentile.




        Other issues



        • unsigned long int might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t or uint64_t from the header <cstdint>. If the bit length itself is not relevant, maybe use size_t (or simply unsigned) instead to prevent confusion.


        • Do you need the ordering of std::map? If not, you might want to use std::unordered_map instead (hash map instead of binary search tree).


        • Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.






        share|improve this answer















        Wrapper classes



        I don't get why many algorithms (like get_mean, get_median, ...) are wrapped inside a class that is basically just a fancy namespace.



        Why force the user to write mean.get_mean(...) if mean(...) would suffice?



        Repetition



        In many functions, there is a repeating pattern: The function takes a bool sorted parameter, and the first statement is if(!sorted) std::sort(...);.



        First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.



        Second, in many cases it isn't actually necessary to fully sort the std::vector ($mathcalO(n log n)$): There might be alternatives (like std::nth_element, $mathcalO(n)$) that can do the intended job and have better performance.



        Last, why pass explicitly std::vector<T>? Not only does this force a copy of all the underlying data, many of those algorithms are general enough so that they should work on other data structures, too! Taking a pair of iterators also means that they can be applied to parts of containers (e.g. simplifying the quartile calculation).



        For example, fixing these issues on median::get_medain



        template<typename Iter>
        auto median_of_sorted(Iter begin, Iter end)
        auto size = std::distance(begin, end);

        if(size % 2 != 0)
        return *(begin + size / 2);


        auto middle_low = *(begin + (size - 1) / 2);
        auto middle_high = *(begin + (size + 1) / 2);

        return (middle_low + middle_high) / 2;


        template<typename Iter>
        auto median(Iter begin, Iter end)
        auto size = std::distance(begin, end);

        if(size % 2 != 0)
        auto middle = begin + size / 2;
        std::nth_element(begin, middle, end);
        return *middle;


        auto middle_low = begin + (size - 1) / 2;
        auto middle_high = begin + (size + 1) / 2;

        std::nth_element(begin, middle_high, end);
        std::nth_element(begin, middle_low, middle_high);

        return (*middle_low + *middle_high) / 2;




        median_of_sorted has $mathcalO(1)$ runtime complexity (for random access iterators, $mathcalO(n)$ otherwise) and doesn't modify the container.



        median has $mathcalO(n)$ runtime complexity and might modify the container.



        Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.



        Note: The code from median/median_of_sorted can easily be extended to calculate any percentile.




        Other issues



        • unsigned long int might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t or uint64_t from the header <cstdint>. If the bit length itself is not relevant, maybe use size_t (or simply unsigned) instead to prevent confusion.


        • Do you need the ordering of std::map? If not, you might want to use std::unordered_map instead (hash map instead of binary search tree).


        • Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.







        share|improve this answer















        share|improve this answer



        share|improve this answer








        edited Jul 16 at 16:07


























        answered Jul 14 at 17:16









        hoffmale

        4,205630




        4,205630






















             

            draft saved


            draft discarded


























             


            draft saved


            draft discarded














            StackExchange.ready(
            function ()
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f199493%2fbasic-single-header-statistics-and-ml-libray-for-c-scikit-learn-like-impleme%23new-answer', 'question_page');

            );

            Post as a guest













































































            Popular posts from this blog

            Chat program with C++ and SFML

            Function to Return a JSON Like Objects Using VBA Collections and Arrays

            Will my employers contract hold up in court?