Basic single-header statistics and ML library for C++ - scikit-learn-like implementation

.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;

up vote
2
down vote

favorite

I am developing a scikit-learn-like implementation for C++. It is at an early stage, and while developing it I have started to doubt whether this is the correct implementation, since here accuracy is more important than robustness. I am still learning C++ and would like an expert review: because development is only beginning, I can still fix problems now and prevent future errors. Thank you.

#include<algorithm>
#include<cmath>
#include<iostream>
#include<map>
#include<stdexcept>
#include<string>
#include<typeinfo>
#include<vector>

//===================================================================
// FOR COMPUTING MEAN
//===================================================================

/// Arithmetic mean of a sequence of values.
///
/// @tparam T numeric element type. The result is also a T, so for integral T
///           the quotient is truncated.
template<typename T>
class mean
{
public:
    /// Returns the sum of the elements of `vec` divided by their count.
    ///
    /// @param vec values to average; must not be empty.
    /// @return the mean, computed in T.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_mean(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mean: input vector is empty");
        T total = 0;
        for (const auto& value : vec) total += value;
        // Convert the count to T first: dividing a signed total by the unsigned
        // size() would convert a negative total into a huge unsigned value.
        return total / static_cast<T>(vec.size());
    }
};


//===================================================================
// FOR COMPUTING MEDIAN
//===================================================================
/// Median of a sequence of values.
///
/// @tparam T numeric element type. The result is a T, so for integral T the
///           average of the two middle elements is truncated.
template<typename T>
class median
{
public:
    /// Returns the middle value of `vec`, or the average of the two middle
    /// values when the element count is even.
    ///
    /// @param vec    values to examine (taken by value because it may be sorted).
    /// @param sorted pass true when `vec` is already in ascending order to
    ///               skip the sort.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_median(std::vector<T> vec, bool sorted = false) const
    {
        if (vec.empty()) throw std::invalid_argument("get_median: input vector is empty");
        if (!sorted) std::sort(vec.begin(), vec.end());
        const auto upper_middle = vec.size() / 2;
        // Odd count: exactly one middle element (this also covers size 1).
        if (vec.size() % 2 != 0) return vec[upper_middle];
        // Even count: average the two elements around the centre.
        return (vec[upper_middle] + vec[upper_middle - 1]) / 2;
    }
};


//====================================================================
// FOR COMPUTING THE MODE
//====================================================================

/// Mode (most frequent value) of a sequence of values.
///
/// @tparam T element type; must be usable as a std::map key (operator<).
template<typename T>
class mode
{
public:
    /// Returns the value that occurs most often in `vec`. When several values
    /// share the highest count (including the case where every value is
    /// unique), the smallest of them is returned.
    ///
    /// @param vec values to examine; must not be empty.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_mode(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mode: input vector is empty");

        std::map<T, unsigned long int> number_count_table;
        for (const auto& value : vec) ++number_count_table[value];

        // std::map iterates in ascending key order, so keeping the first entry
        // with a strictly larger count yields the smallest most-frequent value.
        auto best = number_count_table.begin();
        for (auto it = number_count_table.begin(); it != number_count_table.end(); ++it)
        {
            if (it->second > best->second) best = it;
        }
        return best->first;
    }
};



//========================================================================================
// FOR COMPUTING WEIGHTED MEAN
//========================================================================================

/// Weighted arithmetic mean.
///
/// @tparam T numeric type of both the values and the weights. The result is
///           computed in T, so integral T truncates.
template <typename T>
class weighted_mean
{
public:
    /// Returns sum(vec[i] * weights[i]) / sum(weights[i]) over the indices of
    /// `vec`.
    ///
    /// @param vec     the values.
    /// @param weights one weight per value; entries beyond vec.size() are
    ///                ignored.
    /// @throws std::invalid_argument if `weights` has fewer entries than `vec`,
    ///         or if the weights used sum to zero (which includes empty `vec`).
    T get_weighted_mean(const std::vector<T>& vec, const std::vector<T>& weights) const
    {
        if (weights.size() < vec.size())
        {
            throw std::invalid_argument("get_weighted_mean: fewer weights than values");
        }
        T numerator = 0;
        T total_weight = 0;
        for (typename std::vector<T>::size_type i = 0; i < vec.size(); ++i)
        {
            numerator += vec[i] * weights[i];
            total_weight += weights[i];
        }
        if (total_weight == T(0))
        {
            throw std::invalid_argument("get_weighted_mean: weights sum to zero");
        }
        return numerator / total_weight;
    }
};


//==========================================================================================
// FOR COMPUTING STANDARD DEVIATION
//==========================================================================================
/// Population standard deviation (the squared deviations are divided by the
/// element count N, not N - 1).
///
/// @tparam T numeric element type. The mean, the accumulation and the result
///           are all computed in T, so integral T truncates at each step.
template <typename T>
class standard_deviation : public mean<T>
{
public:
    /// Returns sqrt(sum((x - mean)^2) / N) for the elements of `vec`.
    ///
    /// @param vec values to examine; must not be empty.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_standard_deviation(const std::vector<T>& vec)
    {
        if (vec.empty())
        {
            throw std::invalid_argument("get_standard_deviation: input vector is empty");
        }
        const T mean_value = this->get_mean(vec);
        T squared_deviation_sum = 0;
        for (const auto& value : vec)
        {
            const T deviation = value - mean_value;
            squared_deviation_sum += deviation * deviation;
        }
        // Convert the count to T so the division is not done in unsigned arithmetic.
        return static_cast<T>(std::sqrt(squared_deviation_sum / static_cast<T>(vec.size())));
    }
};


//==========================================================================================
// FOR COMPUTING INTERQUARTILE RANGE
//==========================================================================================
/*

CALCULATING INTERQUARTILE RANGE FOR ODD NO OF ELEMENTS:
-------------------------------------------------------

Given Elements(E) = 3,5,5,7,8,8,9,10,13,13,14,15,16,22,23

Step 1: Sort it! Here we already have an sorted values

E = 3, 5, 5, 7, 8, 8, 9, 10, 13, 13, 14, 15, 16, 22, 23 

Step 2: Get the median for the values

Median (M) = 10

Step 3: The lower and the upper half

Elements which are left to the median are called left half and
to the right is right half

3 5 5 7 8 8 9 10 13 13 14 15 16 22 23
|___________| | |__________________|
LOWER HALF MEDIAN UPPER HALF

Step 4:
Q1 = median of left half
Q3 = median of right half

interquartile_range = Q3 - Q1

CALCULATING INTERQUARTILE RANGE FOR EVEN NO OF ELEMENTS:
--------------------------------------------------------
Step 1: Sort the array
Step 2: Get the median of the array
Step 3: Insert the median back to the array
Step 4: Follow odd no procedure since we have an array
of odd size.

*/
/// Interquartile range (Q3 - Q1) using the median-of-halves method.
///
/// The sorted data is split into a lower half (the first size/2 elements) and
/// an upper half (the last size/2 elements); for an odd element count the
/// middle element belongs to neither half. Q1 and Q3 are the medians of the
/// two halves.
///
/// @tparam T numeric element type.
template <typename T>
class interquartile_range : public median<T>
{
public:
    /// Returns Q3 - Q1 for the elements of `vec`.
    ///
    /// @param vec    values to examine (taken by value because it may be sorted).
    /// @param sorted pass true when `vec` is already in ascending order.
    /// @throws std::invalid_argument if `vec` has fewer than two elements,
    ///         because both halves would then be empty.
    T get_interquartile_range(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
        {
            throw std::invalid_argument("get_interquartile_range: at least two elements are required");
        }
        if (!sorted) std::sort(vec.begin(), vec.end());

        typedef typename std::vector<T>::difference_type offset_type;
        const offset_type half = static_cast<offset_type>(vec.size() / 2);
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());

        // Both halves are slices of sorted data, so the median need not re-sort.
        const T q1 = this->get_median(lower_half, true);
        const T q3 = this->get_median(upper_half, true);
        return q3 - q1;
    }
};


//================================================================================
// FREQUENCY MAP TO VECTOR CONVERTER
//================================================================================

/// Expands a value -> frequency table into a flat list of values.
///
/// @tparam T value type (the key type of the frequency map).
template <typename T>
class frequency_map_converter
{
public:
    /// Appends each key of `frequency_map` to `target_vector`, repeated as many
    /// times as its mapped count, in ascending key order. Existing contents of
    /// `target_vector` are kept.
    ///
    /// @param frequency_map value -> number of occurrences.
    /// @param target_vector output vector that receives the expanded values.
    void to_vector(const std::map<T, unsigned long int>& frequency_map, std::vector<T> &target_vector) const
    {
        for (const auto& element : frequency_map)
        {
            target_vector.insert(target_vector.end(), element.second, element.first);
        }
    }
};



//================================================================================
// FOR CALCULATING THE RANGE
//================================================================================

/*

HOW TO CALCULATE THE RANGE
--------------------------

sorted input vector = 1, 2, 3, 4
greatest_value = 4
least_value = 1
range = greatest_value - least_value
i.e range = 3
*/

/// Range of a sequence: greatest value minus least value.
///
/// @tparam T numeric element type.
template <typename T>
class range
{
public:
    /// Returns the difference between the largest and smallest element.
    ///
    /// @param vec    values to examine; must not be empty.
    /// @param sorted pass true when `vec` is already in ascending order; the
    ///               last and first elements are then used directly.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_range(const std::vector<T>& vec, bool sorted = false) const
    {
        if (vec.empty()) throw std::invalid_argument("get_range: input vector is empty");
        if (sorted) return vec.back() - vec.front();
        // Only the extremes are needed, so a single scan replaces a full sort.
        const auto extremes = std::minmax_element(vec.begin(), vec.end());
        return *extremes.second - *extremes.first;
    }
};



//===============================================================================
// FOR CALCULATING THE QUARTILE
//===============================================================================

/// Quartiles Q1, Q2 and Q3 using the median-of-halves method.
///
/// Q2 is the median of the whole data set. Q1 is the median of the first
/// size/2 sorted elements and Q3 the median of the last size/2 sorted elements;
/// for an odd element count the middle element belongs to neither half.
///
/// @tparam T numeric element type.
template <typename T>
class quartile : public median<T>
{
public:
    /// Computes the three quartiles of `vec`.
    ///
    /// @param vec    values to examine (taken by value because it may be sorted).
    /// @param sorted pass true when `vec` is already in ascending order.
    /// @return a map with the keys "q1", "q2" and "q3".
    /// @throws std::invalid_argument if `vec` has fewer than two elements,
    ///         because both halves would then be empty.
    std::map<std::string, T> get_quartile(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
        {
            throw std::invalid_argument("get_quartile: at least two elements are required");
        }
        if (!sorted) std::sort(vec.begin(), vec.end());

        typedef typename std::vector<T>::difference_type offset_type;
        const offset_type half = static_cast<offset_type>(vec.size() / 2);
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());

        // Everything passed to get_median below is already in ascending order.
        std::map<std::string, T> result;
        result["q1"] = this->get_median(lower_half, true);
        result["q2"] = this->get_median(vec, true);
        result["q3"] = this->get_median(upper_half, true);
        return result;
    }
};


// ====================== TEXT TO NUMERICAL DATA CONVERTERS ===================

/// Maps labels to consecutive integer codes, starting at 0.
///
/// @tparam T label type; must be sortable and usable as a std::map key.
template <typename T>
class LabelEncoder
{
private:
    long int current_encoded_value = -1;      // last code handed out; -1 means none yet
    std::vector<T> array;                     // labels passed to the most recent fit()
    std::map < T, long int> encoded_values;   // label -> code
public:
    /// Learns codes for the labels in `array`. Labels not seen before receive
    /// the next free codes in ascending label order. Codes assigned by earlier
    /// fit() calls are kept, so repeated calls extend the mapping.
    ///
    /// @param array labels to learn; also remembered for the no-argument
    ///              transform().
    void fit(std::vector<T> array)
    {
        this->array = array;
        std::sort(array.begin(), array.end());
        for (const auto& label : array)
        {
            if (encoded_values.find(label) == encoded_values.end())
            {
                encoded_values[label] = ++current_encoded_value;
            }
        }
    }

    /// Converts each label in `array` to its learned code.
    ///
    /// @throws std::invalid_argument if a label was never passed to fit().
    ///         (Looking it up with operator[] would silently give it code 0
    ///         and insert it into the mapping.)
    std::vector<long int> transform(const std::vector<T>& array) const
    {
        std::vector<long int> transformed_array;
        transformed_array.reserve(array.size());
        for (const auto& label : array)
        {
            const auto found = encoded_values.find(label);
            if (found == encoded_values.end())
            {
                throw std::invalid_argument("LabelEncoder::transform: label was not seen during fit");
            }
            transformed_array.push_back(found->second);
        }
        return transformed_array;
    }

    /// Converts the labels given to the most recent fit() call.
    std::vector<long int> transform() const
    {
        return transform(this->array);
    }
};

/*
Future Milestones - To implement the most commonly used vectorizers from scikit-learn:
 1. Implement CountVectorizer
 2. Implement TfidfVectorizer
*/

// ============================ END OF TEXT TO NUMERICAL ENCODERS ==============================


// ============================ ACTIVATION FUNCTIONS ============================

/// Common neural-network activation functions.
///
/// @tparam T numeric input type.
template <typename T>
class activation_function
{
public:
    /// Returns `value` unchanged.
    T identity(T value) const
    {
        return value;
    }

    /// Logistic sigmoid: 1 / (1 + e^(-value)).
    long double sigmoid(T value) const
    {
        // Negate after widening so that unsigned T does not wrap around.
        const long double x = static_cast<long double>(value);
        return 1.0L / (1.0L + std::exp(-x));
    }

    /// Hyperbolic tangent of `value`.
    long double tan_h(T value) const
    {
        // std::tanh saturates to +/-1 for large |value|; the explicit
        // (e^x - e^-x) / (e^x + e^-x) form overflows to inf/inf = NaN there.
        return std::tanh(static_cast<long double>(value));
    }

    /// Step function: 0 for negative `value`, 1 otherwise.
    int threshold(T value) const
    {
        return (value < 0) ? 0 : 1;
    }
};

This code passed all the unit tests on HackerRank for some of the statistics exercises.

Most of the ML libraries available in C++ come without documentation, so they are hard to understand and use. That is why I took this initiative. The main aim is to make it very easy for the end user, as scikit-learn does. It is a single header, so it is easy to incorporate into existing projects.

License: Apache 2.0
Documentation: https://github.com/VISWESWARAN1998/statx

asked Jul 14 at 15:56

VISWESWARAN NAGASIVAM

192214

3

you are aware that even with the last sentence in the question, this is licensed under CC-BY-SA-3.0 right?
– Vogel612 ♦
Jul 14 at 17:15

add a comment |

up vote
2
down vote

favorite

#include<iostream>
#include<vector>
#include<map>
#include<algorithm>
#include<typeinfo>
#include<cmath>

//===================================================================
// FOR COMPUTING MEAN
//===================================================================

/// Arithmetic mean of a sequence of values.
///
/// @tparam T numeric element type. The result is also a T, so for integral T
///           the quotient is truncated.
template<typename T>
class mean
{
public:
    /// Returns the sum of the elements of `vec` divided by their count.
    ///
    /// @param vec values to average; must not be empty.
    /// @return the mean, computed in T.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_mean(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mean: input vector is empty");
        T total = 0;
        for (const auto& value : vec) total += value;
        // Convert the count to T first: dividing a signed total by the unsigned
        // size() would convert a negative total into a huge unsigned value.
        return total / static_cast<T>(vec.size());
    }
};


//===================================================================
// FOR COMPUTING MEDIAN
//===================================================================
/// Median of a sequence of values.
///
/// @tparam T numeric element type. The result is a T, so for integral T the
///           average of the two middle elements is truncated.
template<typename T>
class median
{
public:
    /// Returns the middle value of `vec`, or the average of the two middle
    /// values when the element count is even.
    ///
    /// @param vec    values to examine (taken by value because it may be sorted).
    /// @param sorted pass true when `vec` is already in ascending order to
    ///               skip the sort.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_median(std::vector<T> vec, bool sorted = false) const
    {
        if (vec.empty()) throw std::invalid_argument("get_median: input vector is empty");
        if (!sorted) std::sort(vec.begin(), vec.end());
        const auto upper_middle = vec.size() / 2;
        // Odd count: exactly one middle element (this also covers size 1).
        if (vec.size() % 2 != 0) return vec[upper_middle];
        // Even count: average the two elements around the centre.
        return (vec[upper_middle] + vec[upper_middle - 1]) / 2;
    }
};


//====================================================================
// FOR COMPUTING THE MODE
//====================================================================

/// Mode (most frequent value) of a sequence of values.
///
/// @tparam T element type; must be usable as a std::map key (operator<).
template<typename T>
class mode
{
public:
    /// Returns the value that occurs most often in `vec`. When several values
    /// share the highest count (including the case where every value is
    /// unique), the smallest of them is returned.
    ///
    /// @param vec values to examine; must not be empty.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_mode(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mode: input vector is empty");

        std::map<T, unsigned long int> number_count_table;
        for (const auto& value : vec) ++number_count_table[value];

        // std::map iterates in ascending key order, so keeping the first entry
        // with a strictly larger count yields the smallest most-frequent value.
        auto best = number_count_table.begin();
        for (auto it = number_count_table.begin(); it != number_count_table.end(); ++it)
        {
            if (it->second > best->second) best = it;
        }
        return best->first;
    }
};



//========================================================================================
// FOR COMPUTING WEIGHTED MEAN
//========================================================================================

/// Weighted arithmetic mean.
///
/// @tparam T numeric type of both the values and the weights. The result is
///           computed in T, so integral T truncates.
template <typename T>
class weighted_mean
{
public:
    /// Returns sum(vec[i] * weights[i]) / sum(weights[i]) over the indices of
    /// `vec`.
    ///
    /// @param vec     the values.
    /// @param weights one weight per value; entries beyond vec.size() are
    ///                ignored.
    /// @throws std::invalid_argument if `weights` has fewer entries than `vec`,
    ///         or if the weights used sum to zero (which includes empty `vec`).
    T get_weighted_mean(const std::vector<T>& vec, const std::vector<T>& weights) const
    {
        if (weights.size() < vec.size())
        {
            throw std::invalid_argument("get_weighted_mean: fewer weights than values");
        }
        T numerator = 0;
        T total_weight = 0;
        for (typename std::vector<T>::size_type i = 0; i < vec.size(); ++i)
        {
            numerator += vec[i] * weights[i];
            total_weight += weights[i];
        }
        if (total_weight == T(0))
        {
            throw std::invalid_argument("get_weighted_mean: weights sum to zero");
        }
        return numerator / total_weight;
    }
};


//==========================================================================================
// FOR COMPUTING STANDARD DEVIATION
//==========================================================================================
/// Population standard deviation (the squared deviations are divided by the
/// element count N, not N - 1).
///
/// @tparam T numeric element type. The mean, the accumulation and the result
///           are all computed in T, so integral T truncates at each step.
template <typename T>
class standard_deviation : public mean<T>
{
public:
    /// Returns sqrt(sum((x - mean)^2) / N) for the elements of `vec`.
    ///
    /// @param vec values to examine; must not be empty.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_standard_deviation(const std::vector<T>& vec)
    {
        if (vec.empty())
        {
            throw std::invalid_argument("get_standard_deviation: input vector is empty");
        }
        const T mean_value = this->get_mean(vec);
        T squared_deviation_sum = 0;
        for (const auto& value : vec)
        {
            const T deviation = value - mean_value;
            squared_deviation_sum += deviation * deviation;
        }
        // Convert the count to T so the division is not done in unsigned arithmetic.
        return static_cast<T>(std::sqrt(squared_deviation_sum / static_cast<T>(vec.size())));
    }
};


//==========================================================================================
// FOR COMPUTING INTERQUARTILE RANGE
//==========================================================================================
/*

CALCULATING INTERQUARTILE RANGE FOR ODD NO OF ELEMENTS:
-------------------------------------------------------

Given Elements(E) = 3,5,5,7,8,8,9,10,13,13,14,15,16,22,23

Step 1: Sort it! Here we already have an sorted values

E = 3, 5, 5, 7, 8, 8, 9, 10, 13, 13, 14, 15, 16, 22, 23 

Step 2: Get the median for the values

Median (M) = 10

Step 3: The lower and the upper half

Elements which are left to the median are called left half and
to the right is right half

3 5 5 7 8 8 9 10 13 13 14 15 16 22 23
|___________| | |__________________|
LOWER HALF MEDIAN UPPER HALF

Step 4:
Q1 = median of left half
Q3 = median of right half

interquartile_range = Q3 - Q1

CALCULATING INTERQUARTILE RANGE FOR EVEN NO OF ELEMENTS:
--------------------------------------------------------
Step 1: Sort the array
Step 2: Get the median of the array
Step 3: Insert the median back to the array
Step 4: Follow odd no procedure since we have an array
of odd size.

*/
/// Interquartile range (Q3 - Q1) using the median-of-halves method.
///
/// The sorted data is split into a lower half (the first size/2 elements) and
/// an upper half (the last size/2 elements); for an odd element count the
/// middle element belongs to neither half. Q1 and Q3 are the medians of the
/// two halves.
///
/// @tparam T numeric element type.
template <typename T>
class interquartile_range : public median<T>
{
public:
    /// Returns Q3 - Q1 for the elements of `vec`.
    ///
    /// @param vec    values to examine (taken by value because it may be sorted).
    /// @param sorted pass true when `vec` is already in ascending order.
    /// @throws std::invalid_argument if `vec` has fewer than two elements,
    ///         because both halves would then be empty.
    T get_interquartile_range(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
        {
            throw std::invalid_argument("get_interquartile_range: at least two elements are required");
        }
        if (!sorted) std::sort(vec.begin(), vec.end());

        typedef typename std::vector<T>::difference_type offset_type;
        const offset_type half = static_cast<offset_type>(vec.size() / 2);
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());

        // Both halves are slices of sorted data, so the median need not re-sort.
        const T q1 = this->get_median(lower_half, true);
        const T q3 = this->get_median(upper_half, true);
        return q3 - q1;
    }
};


//================================================================================
// FREQUENCY MAP TO VECTOR CONVERTER
//================================================================================

/// Expands a value -> frequency table into a flat list of values.
///
/// @tparam T value type (the key type of the frequency map).
template <typename T>
class frequency_map_converter
{
public:
    /// Appends each key of `frequency_map` to `target_vector`, repeated as many
    /// times as its mapped count, in ascending key order. Existing contents of
    /// `target_vector` are kept.
    ///
    /// @param frequency_map value -> number of occurrences.
    /// @param target_vector output vector that receives the expanded values.
    void to_vector(const std::map<T, unsigned long int>& frequency_map, std::vector<T> &target_vector) const
    {
        for (const auto& element : frequency_map)
        {
            target_vector.insert(target_vector.end(), element.second, element.first);
        }
    }
};



//================================================================================
// FOR CALCULATING THE RANGE
//================================================================================

/*

HOW TO CALCULATE THE RANGE
--------------------------

sorted input vector = 1, 2, 3, 4
greatest_value = 4
least_value = 1
range = greatest_value - least_value
i.e range = 3
*/

/// Range of a sequence: greatest value minus least value.
///
/// @tparam T numeric element type.
template <typename T>
class range
{
public:
    /// Returns the difference between the largest and smallest element.
    ///
    /// @param vec    values to examine; must not be empty.
    /// @param sorted pass true when `vec` is already in ascending order; the
    ///               last and first elements are then used directly.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_range(const std::vector<T>& vec, bool sorted = false) const
    {
        if (vec.empty()) throw std::invalid_argument("get_range: input vector is empty");
        if (sorted) return vec.back() - vec.front();
        // Only the extremes are needed, so a single scan replaces a full sort.
        const auto extremes = std::minmax_element(vec.begin(), vec.end());
        return *extremes.second - *extremes.first;
    }
};



//===============================================================================
// FOR CALCULATING THE QUARTILE
//===============================================================================

/// Quartiles Q1, Q2 and Q3 using the median-of-halves method.
///
/// Q2 is the median of the whole data set. Q1 is the median of the first
/// size/2 sorted elements and Q3 the median of the last size/2 sorted elements;
/// for an odd element count the middle element belongs to neither half.
///
/// @tparam T numeric element type.
template <typename T>
class quartile : public median<T>
{
public:
    /// Computes the three quartiles of `vec`.
    ///
    /// @param vec    values to examine (taken by value because it may be sorted).
    /// @param sorted pass true when `vec` is already in ascending order.
    /// @return a map with the keys "q1", "q2" and "q3".
    /// @throws std::invalid_argument if `vec` has fewer than two elements,
    ///         because both halves would then be empty.
    std::map<std::string, T> get_quartile(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
        {
            throw std::invalid_argument("get_quartile: at least two elements are required");
        }
        if (!sorted) std::sort(vec.begin(), vec.end());

        typedef typename std::vector<T>::difference_type offset_type;
        const offset_type half = static_cast<offset_type>(vec.size() / 2);
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());

        // Everything passed to get_median below is already in ascending order.
        std::map<std::string, T> result;
        result["q1"] = this->get_median(lower_half, true);
        result["q2"] = this->get_median(vec, true);
        result["q3"] = this->get_median(upper_half, true);
        return result;
    }
};


// ====================== TEXT TO NUMERICAL DATA CONVERTERS ===================

/// Maps labels to consecutive integer codes, starting at 0.
///
/// @tparam T label type; must be sortable and usable as a std::map key.
template <typename T>
class LabelEncoder
{
private:
    long int current_encoded_value = -1;      // last code handed out; -1 means none yet
    std::vector<T> array;                     // labels passed to the most recent fit()
    std::map < T, long int> encoded_values;   // label -> code
public:
    /// Learns codes for the labels in `array`. Labels not seen before receive
    /// the next free codes in ascending label order. Codes assigned by earlier
    /// fit() calls are kept, so repeated calls extend the mapping.
    ///
    /// @param array labels to learn; also remembered for the no-argument
    ///              transform().
    void fit(std::vector<T> array)
    {
        this->array = array;
        std::sort(array.begin(), array.end());
        for (const auto& label : array)
        {
            if (encoded_values.find(label) == encoded_values.end())
            {
                encoded_values[label] = ++current_encoded_value;
            }
        }
    }

    /// Converts each label in `array` to its learned code.
    ///
    /// @throws std::invalid_argument if a label was never passed to fit().
    ///         (Looking it up with operator[] would silently give it code 0
    ///         and insert it into the mapping.)
    std::vector<long int> transform(const std::vector<T>& array) const
    {
        std::vector<long int> transformed_array;
        transformed_array.reserve(array.size());
        for (const auto& label : array)
        {
            const auto found = encoded_values.find(label);
            if (found == encoded_values.end())
            {
                throw std::invalid_argument("LabelEncoder::transform: label was not seen during fit");
            }
            transformed_array.push_back(found->second);
        }
        return transformed_array;
    }

    /// Converts the labels given to the most recent fit() call.
    std::vector<long int> transform() const
    {
        return transform(this->array);
    }
};

/*
Future Milestones - To implement the most commonly used vectorizers from scikit-learn:
 1. Implement CountVectorizer
 2. Implement TfidfVectorizer
*/

// ============================ END OF TEXT TO NUMERICAL ENCODERS ==============================


// ============================ ACTIVATION FUNCTIONS ============================

/// Common neural-network activation functions.
///
/// @tparam T numeric input type.
template <typename T>
class activation_function
{
public:
    /// Returns `value` unchanged.
    T identity(T value) const
    {
        return value;
    }

    /// Logistic sigmoid: 1 / (1 + e^(-value)).
    long double sigmoid(T value) const
    {
        // Negate after widening so that unsigned T does not wrap around.
        const long double x = static_cast<long double>(value);
        return 1.0L / (1.0L + std::exp(-x));
    }

    /// Hyperbolic tangent of `value`.
    long double tan_h(T value) const
    {
        // std::tanh saturates to +/-1 for large |value|; the explicit
        // (e^x - e^-x) / (e^x + e^-x) form overflows to inf/inf = NaN there.
        return std::tanh(static_cast<long double>(value));
    }

    /// Step function: 0 for negative `value`, 1 otherwise.
    int threshold(T value) const
    {
        return (value < 0) ? 0 : 1;
    }
};

This code passed all the unit tests on HackerRank for some of the statistics exercises.

License: Apache 2.0
Documentation: https://github.com/VISWESWARAN1998/statx

asked Jul 14 at 15:56

VISWESWARAN NAGASIVAM

192214

3

you are aware that even with the last sentence in the question, this is licensed under CC-BY-SA-3.0 right?
– Vogel612 ♦
Jul 14 at 17:15

add a comment |

up vote
2
down vote

favorite

#include<iostream>
#include<vector>
#include<map>
#include<algorithm>
#include<typeinfo>
#include<cmath>

//===================================================================
// FOR COMPUTING MEAN
//===================================================================

/// Arithmetic mean of a sequence of values.
///
/// @tparam T numeric element type. The result is also a T, so for integral T
///           the quotient is truncated.
template<typename T>
class mean
{
public:
    /// Returns the sum of the elements of `vec` divided by their count.
    ///
    /// @param vec values to average; must not be empty.
    /// @return the mean, computed in T.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_mean(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mean: input vector is empty");
        T total = 0;
        for (const auto& value : vec) total += value;
        // Convert the count to T first: dividing a signed total by the unsigned
        // size() would convert a negative total into a huge unsigned value.
        return total / static_cast<T>(vec.size());
    }
};


//===================================================================
// FOR COMPUTING MEDIAN
//===================================================================
/// Median of a sequence of values.
///
/// @tparam T numeric element type. The result is a T, so for integral T the
///           average of the two middle elements is truncated.
template<typename T>
class median
{
public:
    /// Returns the middle value of `vec`, or the average of the two middle
    /// values when the element count is even.
    ///
    /// @param vec    values to examine (taken by value because it may be sorted).
    /// @param sorted pass true when `vec` is already in ascending order to
    ///               skip the sort.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_median(std::vector<T> vec, bool sorted = false) const
    {
        if (vec.empty()) throw std::invalid_argument("get_median: input vector is empty");
        if (!sorted) std::sort(vec.begin(), vec.end());
        const auto upper_middle = vec.size() / 2;
        // Odd count: exactly one middle element (this also covers size 1).
        if (vec.size() % 2 != 0) return vec[upper_middle];
        // Even count: average the two elements around the centre.
        return (vec[upper_middle] + vec[upper_middle - 1]) / 2;
    }
};


//====================================================================
// FOR COMPUTING THE MODE
//====================================================================

/// Mode (most frequent value) of a sequence of values.
///
/// @tparam T element type; must be usable as a std::map key (operator<).
template<typename T>
class mode
{
public:
    /// Returns the value that occurs most often in `vec`. When several values
    /// share the highest count (including the case where every value is
    /// unique), the smallest of them is returned.
    ///
    /// @param vec values to examine; must not be empty.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_mode(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mode: input vector is empty");

        std::map<T, unsigned long int> number_count_table;
        for (const auto& value : vec) ++number_count_table[value];

        // std::map iterates in ascending key order, so keeping the first entry
        // with a strictly larger count yields the smallest most-frequent value.
        auto best = number_count_table.begin();
        for (auto it = number_count_table.begin(); it != number_count_table.end(); ++it)
        {
            if (it->second > best->second) best = it;
        }
        return best->first;
    }
};



//========================================================================================
// FOR COMPUTING WEIGHTED MEAN
//========================================================================================

/// Weighted arithmetic mean.
///
/// @tparam T numeric type of both the values and the weights. The result is
///           computed in T, so integral T truncates.
template <typename T>
class weighted_mean
{
public:
    /// Returns sum(vec[i] * weights[i]) / sum(weights[i]) over the indices of
    /// `vec`.
    ///
    /// @param vec     the values.
    /// @param weights one weight per value; entries beyond vec.size() are
    ///                ignored.
    /// @throws std::invalid_argument if `weights` has fewer entries than `vec`,
    ///         or if the weights used sum to zero (which includes empty `vec`).
    T get_weighted_mean(const std::vector<T>& vec, const std::vector<T>& weights) const
    {
        if (weights.size() < vec.size())
        {
            throw std::invalid_argument("get_weighted_mean: fewer weights than values");
        }
        T numerator = 0;
        T total_weight = 0;
        for (typename std::vector<T>::size_type i = 0; i < vec.size(); ++i)
        {
            numerator += vec[i] * weights[i];
            total_weight += weights[i];
        }
        if (total_weight == T(0))
        {
            throw std::invalid_argument("get_weighted_mean: weights sum to zero");
        }
        return numerator / total_weight;
    }
};


//==========================================================================================
// FOR COMPUTING STANDARD DEVIATION
//==========================================================================================
/// Population standard deviation (the squared deviations are divided by the
/// element count N, not N - 1).
///
/// @tparam T numeric element type. The mean, the accumulation and the result
///           are all computed in T, so integral T truncates at each step.
template <typename T>
class standard_deviation : public mean<T>
{
public:
    /// Returns sqrt(sum((x - mean)^2) / N) for the elements of `vec`.
    ///
    /// @param vec values to examine; must not be empty.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_standard_deviation(const std::vector<T>& vec)
    {
        if (vec.empty())
        {
            throw std::invalid_argument("get_standard_deviation: input vector is empty");
        }
        const T mean_value = this->get_mean(vec);
        T squared_deviation_sum = 0;
        for (const auto& value : vec)
        {
            const T deviation = value - mean_value;
            squared_deviation_sum += deviation * deviation;
        }
        // Convert the count to T so the division is not done in unsigned arithmetic.
        return static_cast<T>(std::sqrt(squared_deviation_sum / static_cast<T>(vec.size())));
    }
};


//==========================================================================================
// FOR COMPUTING INTERQUARTILE RANGE
//==========================================================================================
/*

CALCULATING INTERQUARTILE RANGE FOR ODD NO OF ELEMENTS:
-------------------------------------------------------

Given Elements(E) = 3,5,5,7,8,8,9,10,13,13,14,15,16,22,23

Step 1: Sort it! Here we already have an sorted values

E = 3, 5, 5, 7, 8, 8, 9, 10, 13, 13, 14, 15, 16, 22, 23 

Step 2: Get the median for the values

Median (M) = 10

Step 3: The lower and the upper half

Elements which are left to the median are called left half and
to the right is right half

3 5 5 7 8 8 9 10 13 13 14 15 16 22 23
|___________| | |__________________|
LOWER HALF MEDIAN UPPER HALF

Step 4:
Q1 = median of left half
Q3 = median of right half

interquartile_range = Q3 - Q1

CALCULATING INTERQUARTILE RANGE FOR EVEN NO OF ELEMENTS:
--------------------------------------------------------
Step 1: Sort the array
Step 2: Get the median of the array
Step 3: Insert the median back to the array
Step 4: Follow odd no procedure since we have an array
of odd size.

*/
/// Interquartile range (Q3 - Q1) using the median-of-halves method.
///
/// The sorted data is split into a lower half (the first size/2 elements) and
/// an upper half (the last size/2 elements); for an odd element count the
/// middle element belongs to neither half. Q1 and Q3 are the medians of the
/// two halves.
///
/// @tparam T numeric element type.
template <typename T>
class interquartile_range : public median<T>
{
public:
    /// Returns Q3 - Q1 for the elements of `vec`.
    ///
    /// @param vec    values to examine (taken by value because it may be sorted).
    /// @param sorted pass true when `vec` is already in ascending order.
    /// @throws std::invalid_argument if `vec` has fewer than two elements,
    ///         because both halves would then be empty.
    T get_interquartile_range(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
        {
            throw std::invalid_argument("get_interquartile_range: at least two elements are required");
        }
        if (!sorted) std::sort(vec.begin(), vec.end());

        typedef typename std::vector<T>::difference_type offset_type;
        const offset_type half = static_cast<offset_type>(vec.size() / 2);
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());

        // Both halves are slices of sorted data, so the median need not re-sort.
        const T q1 = this->get_median(lower_half, true);
        const T q3 = this->get_median(upper_half, true);
        return q3 - q1;
    }
};


//================================================================================
// FREQUENCY MAP TO VECTOR CONVERTER
//================================================================================

/// Expands a value -> frequency table into a flat list of values.
///
/// @tparam T value type (the key type of the frequency map).
template <typename T>
class frequency_map_converter
{
public:
    /// Appends each key of `frequency_map` to `target_vector`, repeated as many
    /// times as its mapped count, in ascending key order. Existing contents of
    /// `target_vector` are kept.
    ///
    /// @param frequency_map value -> number of occurrences.
    /// @param target_vector output vector that receives the expanded values.
    void to_vector(const std::map<T, unsigned long int>& frequency_map, std::vector<T> &target_vector) const
    {
        for (const auto& element : frequency_map)
        {
            target_vector.insert(target_vector.end(), element.second, element.first);
        }
    }
};



//================================================================================
// FOR CALCULATING THE RANGE
//================================================================================

/*

HOW TO CALCULATE THE RANGE
--------------------------

sorted input vector = 1, 2, 3, 4
greatest_value = 4
least_value = 1
range = greatest_value - least_value
i.e range = 3
*/

/// Range of a sequence: greatest value minus least value.
///
/// @tparam T numeric element type.
template <typename T>
class range
{
public:
    /// Returns the difference between the largest and smallest element.
    ///
    /// @param vec    values to examine; must not be empty.
    /// @param sorted pass true when `vec` is already in ascending order; the
    ///               last and first elements are then used directly.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_range(const std::vector<T>& vec, bool sorted = false) const
    {
        if (vec.empty()) throw std::invalid_argument("get_range: input vector is empty");
        if (sorted) return vec.back() - vec.front();
        // Only the extremes are needed, so a single scan replaces a full sort.
        const auto extremes = std::minmax_element(vec.begin(), vec.end());
        return *extremes.second - *extremes.first;
    }
};



//===============================================================================
// FOR CALCULATING THE QUARTILE
//===============================================================================

/// Quartiles Q1, Q2 and Q3 using the median-of-halves method.
///
/// Q2 is the median of the whole data set. Q1 is the median of the first
/// size/2 sorted elements and Q3 the median of the last size/2 sorted elements;
/// for an odd element count the middle element belongs to neither half.
///
/// @tparam T numeric element type.
template <typename T>
class quartile : public median<T>
{
public:
    /// Computes the three quartiles of `vec`.
    ///
    /// @param vec    values to examine (taken by value because it may be sorted).
    /// @param sorted pass true when `vec` is already in ascending order.
    /// @return a map with the keys "q1", "q2" and "q3".
    /// @throws std::invalid_argument if `vec` has fewer than two elements,
    ///         because both halves would then be empty.
    std::map<std::string, T> get_quartile(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
        {
            throw std::invalid_argument("get_quartile: at least two elements are required");
        }
        if (!sorted) std::sort(vec.begin(), vec.end());

        typedef typename std::vector<T>::difference_type offset_type;
        const offset_type half = static_cast<offset_type>(vec.size() / 2);
        const std::vector<T> lower_half(vec.begin(), vec.begin() + half);
        const std::vector<T> upper_half(vec.end() - half, vec.end());

        // Everything passed to get_median below is already in ascending order.
        std::map<std::string, T> result;
        result["q1"] = this->get_median(lower_half, true);
        result["q2"] = this->get_median(vec, true);
        result["q3"] = this->get_median(upper_half, true);
        return result;
    }
};


// ====================== TEXT TO NUMERICAL DATA CONVERTERS ===================

/// Maps labels to consecutive integer codes, starting at 0.
///
/// @tparam T label type; must be sortable and usable as a std::map key.
template <typename T>
class LabelEncoder
{
private:
    long int current_encoded_value = -1;      // last code handed out; -1 means none yet
    std::vector<T> array;                     // labels passed to the most recent fit()
    std::map < T, long int> encoded_values;   // label -> code
public:
    /// Learns codes for the labels in `array`. Labels not seen before receive
    /// the next free codes in ascending label order. Codes assigned by earlier
    /// fit() calls are kept, so repeated calls extend the mapping.
    ///
    /// @param array labels to learn; also remembered for the no-argument
    ///              transform().
    void fit(std::vector<T> array)
    {
        this->array = array;
        std::sort(array.begin(), array.end());
        for (const auto& label : array)
        {
            if (encoded_values.find(label) == encoded_values.end())
            {
                encoded_values[label] = ++current_encoded_value;
            }
        }
    }

    /// Converts each label in `array` to its learned code.
    ///
    /// @throws std::invalid_argument if a label was never passed to fit().
    ///         (Looking it up with operator[] would silently give it code 0
    ///         and insert it into the mapping.)
    std::vector<long int> transform(const std::vector<T>& array) const
    {
        std::vector<long int> transformed_array;
        transformed_array.reserve(array.size());
        for (const auto& label : array)
        {
            const auto found = encoded_values.find(label);
            if (found == encoded_values.end())
            {
                throw std::invalid_argument("LabelEncoder::transform: label was not seen during fit");
            }
            transformed_array.push_back(found->second);
        }
        return transformed_array;
    }

    /// Converts the labels given to the most recent fit() call.
    std::vector<long int> transform() const
    {
        return transform(this->array);
    }
};

/*
Future Milestones - To implement the most commonly used vectorizers from scikit-learn:
 1. Implement CountVectorizer
 2. Implement TfidfVectorizer
*/

// ============================ END OF TEXT TO NUMERICAL ENCODERS ==============================


// ============================ ACTIVATION FUNCTIONS ============================

/// Common neural-network activation functions.
///
/// @tparam T numeric input type.
template <typename T>
class activation_function
{
public:
    /// Returns `value` unchanged.
    T identity(T value) const
    {
        return value;
    }

    /// Logistic sigmoid: 1 / (1 + e^(-value)).
    long double sigmoid(T value) const
    {
        // Negate after widening so that unsigned T does not wrap around.
        const long double x = static_cast<long double>(value);
        return 1.0L / (1.0L + std::exp(-x));
    }

    /// Hyperbolic tangent of `value`.
    long double tan_h(T value) const
    {
        // std::tanh saturates to +/-1 for large |value|; the explicit
        // (e^x - e^-x) / (e^x + e^-x) form overflows to inf/inf = NaN there.
        return std::tanh(static_cast<long double>(value));
    }

    /// Step function: 0 for negative `value`, 1 otherwise.
    int threshold(T value) const
    {
        return (value < 0) ? 0 : 1;
    }
};

This code passed all the unit tests on HackerRank for some of the statistics exercises.

License: Apache 2.0
Documentation: https://github.com/VISWESWARAN1998/statx

asked Jul 14 at 15:56

VISWESWARAN NAGASIVAM

192214

#include<iostream>
#include<vector>
#include<map>
#include<algorithm>
#include<typeinfo>
#include<cmath>

//===================================================================
// FOR COMPUTING MEAN
//===================================================================

/// Arithmetic mean of a sequence of values.
///
/// @tparam T numeric element type. The result is also a T, so for integral T
///           the quotient is truncated.
template<typename T>
class mean
{
public:
    /// Returns the sum of the elements of `vec` divided by their count.
    ///
    /// @param vec values to average; must not be empty.
    /// @return the mean, computed in T.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_mean(const std::vector<T>& vec) const
    {
        if (vec.empty()) throw std::invalid_argument("get_mean: input vector is empty");
        T total = 0;
        for (const auto& value : vec) total += value;
        // Convert the count to T first: dividing a signed total by the unsigned
        // size() would convert a negative total into a huge unsigned value.
        return total / static_cast<T>(vec.size());
    }
};


//===================================================================
// FOR COMPUTING MEDIAN
//===================================================================
/// Median of a sequence of values.
///
/// @tparam T numeric element type. The result is a T, so for integral T the
///           average of the two middle elements is truncated.
template<typename T>
class median
{
public:
    /// Returns the middle value of `vec`, or the average of the two middle
    /// values when the element count is even.
    ///
    /// @param vec    values to examine (taken by value because it may be sorted).
    /// @param sorted pass true when `vec` is already in ascending order to
    ///               skip the sort.
    /// @throws std::invalid_argument if `vec` is empty.
    T get_median(std::vector<T> vec, bool sorted = false) const
    {
        if (vec.empty()) throw std::invalid_argument("get_median: input vector is empty");
        if (!sorted) std::sort(vec.begin(), vec.end());
        const auto upper_middle = vec.size() / 2;
        // Odd count: exactly one middle element (this also covers size 1).
        if (vec.size() % 2 != 0) return vec[upper_middle];
        // Even count: average the two elements around the centre.
        return (vec[upper_middle] + vec[upper_middle - 1]) / 2;
    }
};


//====================================================================
// FOR COMPUTING THE MODE
//====================================================================

// Mode (most frequent value) of a sequence.
template<typename T>
class mode
{
public:
    // Returns the value that occurs most often; ties are resolved in favour
    // of the smallest value (so an all-unique input yields its minimum).
    // Throws std::invalid_argument when vec is empty.
    T get_mode(const std::vector<T>& vec) const
    {
        if (vec.empty())
        {
            throw std::invalid_argument("get_mode: input vector is empty");
        }

        std::map<T, unsigned long int> number_count_table;
        for (const auto& value : vec) ++number_count_table[value];

        // std::map iterates in ascending key order, so keeping only strictly
        // larger counts leaves the smallest of the most frequent values.
        auto best = number_count_table.begin();
        for (auto it = number_count_table.begin(); it != number_count_table.end(); ++it)
        {
            if (it->second > best->second) best = it;
        }
        return best->first;
    }
};



//========================================================================================
// FOR COMPUTING WEIGHTED MEAN
//========================================================================================

// Weighted arithmetic mean.
template <typename T>
class weighted_mean
{
public:
    // Returns sum(vec[i] * weights[i]) / sum(weights[i]), computed in T.
    // Throws std::invalid_argument when the two vectors differ in length or
    // when the weights sum to zero (which includes empty input).
    T get_weighted_mean(const std::vector<T>& vec, const std::vector<T>& weights) const
    {
        if (vec.size() != weights.size())
        {
            throw std::invalid_argument("get_weighted_mean: values and weights differ in length");
        }

        T numerator = 0;
        T total_weight = 0;
        for (typename std::vector<T>::size_type i = 0; i < vec.size(); ++i)
        {
            numerator += vec[i] * weights[i];
            total_weight += weights[i];
        }

        if (total_weight == 0)
        {
            throw std::invalid_argument("get_weighted_mean: total weight is zero");
        }
        return numerator / total_weight;
    }
};


//==========================================================================================
// FOR COMPUTING STANDARD DEVIATION
//==========================================================================================
// Population standard deviation (divides by N, not N - 1).
template <typename T>
class standard_deviation : public mean<T>
{
public:
    // Returns sqrt(sum((x - mean)^2) / N), with the intermediate values held
    // in T and the mean obtained from the inherited get_mean.
    // Throws std::invalid_argument when vec is empty.
    T get_standard_deviation(const std::vector<T>& vec)
    {
        if (vec.empty())
        {
            throw std::invalid_argument("get_standard_deviation: input vector is empty");
        }

        const T mean_value = this->get_mean(vec);
        T squared_deviation_sum = 0;
        for (const auto& value : vec)
        {
            const T deviation = value - mean_value;
            squared_deviation_sum += deviation * deviation;
        }
        // Divide in T so the element count is not mixed in as an unsigned value.
        return static_cast<T>(std::sqrt(squared_deviation_sum / static_cast<T>(vec.size())));
    }
};


//==========================================================================================
// FOR COMPUTING INTERQUARTILE RANGE
//==========================================================================================
/*

CALCULATING INTERQUARTILE RANGE FOR ODD NO OF ELEMENTS:
-------------------------------------------------------

Given Elements(E) = 3,5,5,7,8,8,9,10,13,13,14,15,16,22,23

Step 1: Sort it! Here the values are already sorted.

E = 3, 5, 5, 7, 8, 8, 9, 10, 13, 13, 14, 15, 16, 22, 23 

Step 2: Get the median for the values

Median (M) = 10

Step 3: The lower and the upper half

Elements to the left of the median form the lower half, and
elements to its right form the upper half.

3 5 5 7 8 8 9 10 13 13 14 15 16 22 23
|___________| | |__________________|
LOWER HALF MEDIAN UPPER HALF

Step 4:
Q1 = median of left half
Q3 = median of right half

interquartile_range = Q3 - Q1

CALCULATING INTERQUARTILE RANGE FOR EVEN NO OF ELEMENTS:
--------------------------------------------------------
Step 1: Sort the array
Step 2: Get the median of the array
Step 3: Insert the median back to the array
Step 4: Follow odd no procedure since we have an array
of odd size.

*/
// Interquartile range (Q3 - Q1) using the median-of-halves method.
template <typename T>
class interquartile_range : public median<T>
{
public:
    // Splits the sorted data into a lower and an upper half (the middle
    // element is excluded from both when the count is odd) and returns
    // median(upper half) - median(lower half).
    // vec is taken by value so it can be sorted locally; pass sorted = true
    // to skip the sort.
    // Throws std::invalid_argument when fewer than two elements are given,
    // because both halves would be empty.
    T get_interquartile_range(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
        {
            throw std::invalid_argument("get_interquartile_range: at least two elements are required");
        }
        if (!sorted) std::sort(vec.begin(), vec.end());

        // For an even count both halves meet at count / 2; for an odd count
        // (count + 1) / 2 skips the middle element. This yields the same
        // halves as inserting the median into an even-sized vector and then
        // dropping the middle element.
        const auto count = vec.size();
        const std::vector<T> lower_half(vec.begin(), vec.begin() + count / 2);
        const std::vector<T> upper_half(vec.begin() + (count + 1) / 2, vec.end());

        const T q1 = this->get_median(lower_half, true);
        const T q3 = this->get_median(upper_half, true);
        return q3 - q1;
    }
};


//================================================================================
// FREQUENCY MAP TO VECTOR CONVERTER
//================================================================================

// Expands a value -> count table into a flat list of values.
template <typename T>
class frequency_map_converter
{
public:
    // Appends each key of frequency_map to target_vector as many times as its
    // mapped count, in ascending key order. Existing contents of
    // target_vector are kept.
    void to_vector(const std::map<T, unsigned long int>& frequency_map, std::vector<T> &target_vector)
    {
        for (const auto& entry : frequency_map)
        {
            // Explicit size_type cast selects the (position, count, value) overload.
            target_vector.insert(target_vector.end(),
                                 static_cast<typename std::vector<T>::size_type>(entry.second),
                                 entry.first);
        }
    }
};



//================================================================================
// FOR CALCULATING THE RANGE
//================================================================================

/*

HOW TO CALCULATE THE RANGE
--------------------------

sorted input vector = 1, 2, 3, 4
greatest_value = 4
least_value = 1
range = greatest_value - least_value
i.e range = 3
*/

// Range (largest value minus smallest value) of a sequence.
template <typename T>
class range
{
public:
    // Returns max(vec) - min(vec).
    // When sorted is true the caller asserts that vec is in ascending order
    // and the first and last elements are used directly.
    // Throws std::invalid_argument when vec is empty.
    T get_range(const std::vector<T>& vec, bool sorted = false) const
    {
        if (vec.empty())
        {
            throw std::invalid_argument("get_range: input vector is empty");
        }
        if (sorted) return vec.back() - vec.front();

        // A single linear scan finds both extremes; no sort is needed.
        const auto extremes = std::minmax_element(vec.begin(), vec.end());
        return *extremes.second - *extremes.first;
    }
};



//===============================================================================
// FOR CALCULATING THE QUARTILE
//===============================================================================

// Quartiles (Q1, Q2, Q3) using the median-of-halves method.
template <typename T>
class quartile : public median<T>
{
public:
    // Returns a map with the keys "q1", "q2" and "q3":
    //   q2 - median of the whole data set,
    //   q1 - median of the lower half,
    //   q3 - median of the upper half,
    // where the middle element belongs to neither half when the count is odd.
    // vec is taken by value so it can be sorted locally; pass sorted = true
    // to skip the sort.
    // Throws std::invalid_argument when fewer than two elements are given,
    // because both halves would be empty.
    std::map<std::string, T> get_quartile(std::vector<T> vec, bool sorted = false)
    {
        if (vec.size() < 2)
        {
            throw std::invalid_argument("get_quartile: at least two elements are required");
        }
        if (!sorted) std::sort(vec.begin(), vec.end());

        const auto count = vec.size();
        const std::vector<T> lower_half(vec.begin(), vec.begin() + count / 2);
        // (count + 1) / 2 skips the middle element for an odd count.
        const std::vector<T> upper_half(vec.begin() + (count + 1) / 2, vec.end());

        std::map<std::string, T> result;
        result["q2"] = this->get_median(vec, true);
        result["q1"] = this->get_median(lower_half, true);
        result["q3"] = this->get_median(upper_half, true);
        return result;
    }
};


// ====================== TEXT TO NUMERICAL DATA CONVERTERS ===================

// Maps distinct labels to consecutive integer codes.
template <typename T>
class LabelEncoder
{
private:
    long int current_encoded_value = -1;   // last code handed out; -1 before the first fit
    std::vector<T> array;                  // data given to the most recent fit, in original order
    std::map < T, long int> encoded_values; // label -> code
public:
    // Learns codes for the labels in array. Labels are visited in ascending
    // order and each label without a code gets the next unused integer, so a
    // first fit yields codes 0, 1, 2, ... in sorted label order. Codes learned
    // by earlier calls are kept.
    void fit(std::vector<T> array)
    {
        this->array = array;
        std::sort(array.begin(), array.end());
        for (const auto& label : array)
        {
            // emplace only inserts when the label has no code yet.
            if (encoded_values.emplace(label, current_encoded_value + 1).second)
            {
                ++current_encoded_value;
            }
        }
    }

    // Returns the code of every label in array, in the same order.
    // Throws std::invalid_argument for a label that was never fitted; looking
    // it up with operator[] would silently insert it with code 0.
    std::vector<long int> transform(const std::vector<T>& array) const
    {
        std::vector<long int> transformed_array;
        transformed_array.reserve(array.size());
        for (const auto& label : array)
        {
            const auto found = encoded_values.find(label);
            if (found == encoded_values.end())
            {
                throw std::invalid_argument("LabelEncoder::transform: label was not seen during fit");
            }
            transformed_array.push_back(found->second);
        }
        return transformed_array;
    }

    // Encodes the data that was passed to the most recent fit.
    std::vector<long int> transform() const
    {
        return transform(this->array);
    }
};

/*
Future milestones - implement the most commonly used vectorizers from scikit-learn:
 1. Implement CountVectorizer
 2. Implement TfidfVectorizer
*/

// ============================ END OF TEXT TO NUMERICAL ENCODERS ==============================


// ============================ ACTIVATION FUNCTIONS ============================

// Scalar activation functions for a numeric type T.
template <typename T>
class activation_function
{
public:
    // Identity activation: returns the input unchanged.
    T identity(T value) const
    {
        return value;
    }

    // Logistic sigmoid 1 / (1 + e^-x), evaluated in long double.
    long double sigmoid(T value) const
    {
        // Convert before negating: negating in T would wrap for unsigned
        // types, and integral T would otherwise use the double overload of exp.
        const long double x = static_cast<long double>(value);
        return 1.0L / (1.0L + std::exp(-x));
    }

    // Hyperbolic tangent, evaluated in long double.
    long double tan_h(T value) const
    {
        // The explicit (e^x - e^-x) / (e^x + e^-x) form overflows to
        // inf / inf = NaN for large |x|; std::tanh does not.
        return std::tanh(static_cast<long double>(value));
    }

    // Binary step: 0 for negative input, 1 otherwise.
    int threshold(T value) const
    {
        return (value < 0) ? 0 : 1;
    }
};

This code passed all the HackerRank unit tests for some of the statistics exercises.

License: Apache 2.0
Documentation: https://github.com/VISWESWARAN1998/statx

asked Jul 14 at 15:56

VISWESWARAN NAGASIVAM

192214

asked Jul 14 at 15:56

VISWESWARAN NAGASIVAM

192214

asked Jul 14 at 15:56

VISWESWARAN NAGASIVAM

192214

asked Jul 14 at 15:56

VISWESWARAN NAGASIVAM

192214

3

You are aware that, even with the last sentence in the question, this is licensed under CC-BY-SA-3.0, right?
– Vogel612♦
Jul 14 at 17:15

add a comment |

3

You are aware that, even with the last sentence in the question, this is licensed under CC-BY-SA-3.0, right?
– Vogel612♦
Jul 14 at 17:15

You are aware that, even with the last sentence in the question, this is licensed under CC-BY-SA-3.0, right?
– Vogel612♦
Jul 14 at 17:15

add a comment |

1 Answer
1

active

oldest

votes

up vote
4
down vote

accepted

Wrapper classes

I don't get why many algorithms (like get_mean, get_median, ...) are wrapped inside a class that is basically just a fancy namespace.

Why force the user to write mean.get_mean(...) if mean(...) would suffice?

Repetition

In many functions, there is a repeating pattern: The function takes a bool sorted parameter, and the first statement is if(!sorted) std::sort(...);.

First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.

Second, in many cases it isn't actually necessary to fully sort the std::vector ($\mathcal{O}(n \log n)$): There might be alternatives (like std::nth_element, $\mathcal{O}(n)$) that can do the intended job and have better performance.

Last, why pass explicitly std::vector<T>? Not only does this force a copy of all the underlying data, many of those algorithms are general enough so that they should work on other data structures, too! Taking a pair of iterators also means that they can be applied to parts of containers (e.g. simplifying the quartile calculation).

For example, fixing these issues on median::get_median:

// Median of an already sorted range [begin, end); the range is not modified.
// Returns the middle element for an odd count, or the average of the two
// middle elements (computed in the promoted element type, then converted
// back) for an even count.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
typename std::iterator_traits<Iter>::value_type median_of_sorted(Iter begin, Iter end) {
    using value_type = typename std::iterator_traits<Iter>::value_type;

    const auto size = std::distance(begin, end);
    if (size <= 0) {
        throw std::invalid_argument("median_of_sorted: empty range");
    }

    // std::next instead of `begin + k` so non-random-access iterators work too.
    const auto middle_high = std::next(begin, size / 2);
    if (size % 2 != 0) {
        return *middle_high;
    }

    const auto middle_low = std::next(begin, size / 2 - 1);
    // Explicit return type plus cast: with `auto`, element types narrower
    // than int would deduce two different return types and fail to compile.
    return static_cast<value_type>((*middle_low + *middle_high) / 2);
}


// Median of an unsorted random-access range [begin, end).
// Uses std::nth_element instead of a full sort, so the elements of the range
// may be reordered. Returns the middle element for an odd count, or the
// average of the two middle elements (computed in the promoted element type,
// then converted back) for an even count.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
typename std::iterator_traits<Iter>::value_type median(Iter begin, Iter end) {
    using value_type = typename std::iterator_traits<Iter>::value_type;

    const auto size = std::distance(begin, end);
    if (size <= 0) {
        throw std::invalid_argument("median: empty range");
    }

    const auto middle_high = begin + size / 2;
    std::nth_element(begin, middle_high, end);
    if (size % 2 != 0) {
        return *middle_high;
    }

    // After the first call everything before middle_high is <= *middle_high,
    // so the lower middle only has to be selected within that prefix.
    const auto middle_low = middle_high - 1;
    std::nth_element(begin, middle_low, middle_high);

    // Explicit return type plus cast: with `auto`, element types narrower
    // than int would deduce two different return types and fail to compile.
    return static_cast<value_type>((*middle_low + *middle_high) / 2);
}

median_of_sorted has $\mathcal{O}(1)$ runtime complexity (for random access iterators, $\mathcal{O}(n)$ otherwise) and doesn't modify the container.

median has $\mathcal{O}(n)$ runtime complexity and might modify the container.

Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.

Note: The code from median/median_of_sorted can easily be extended to calculate any percentile.

Other issues

unsigned long int might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t or uint64_t from the header <cstdint>. If the bit length itself is not relevant, maybe use size_t (or simply unsigned) instead to prevent confusion.

Do you need the ordering of std::map? If not, you might want to use std::unordered_map instead (hash map instead of binary search tree).

Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.

edited Jul 16 at 16:07

answered Jul 14 at 17:16

hoffmale

4,205630

add a comment |

Your Answer

StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);

);

draft saved

draft discarded

StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f199493%2fbasic-single-header-statistics-and-ml-libray-for-c-scikit-learn-like-impleme%23new-answer', 'question_page');

);

Post as a guest

Name

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

up vote
4
down vote

accepted

Wrapper classes

I don't get why many algorithms (like get_mean, get_median, ...) are wrapped inside a class that is basically just a fancy namespace.

Why force the user to write mean.get_mean(...) if mean(...) would suffice?

Repetition

In many functions, there is a repeating pattern: The function takes a bool sorted parameter, and the first statement is if(!sorted) std::sort(...);.

First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.

For example, fixing these issues on median::get_median:

// Median of an already sorted range [begin, end); the range is not modified.
// Returns the middle element for an odd count, or the average of the two
// middle elements (computed in the promoted element type, then converted
// back) for an even count.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
typename std::iterator_traits<Iter>::value_type median_of_sorted(Iter begin, Iter end) {
    using value_type = typename std::iterator_traits<Iter>::value_type;

    const auto size = std::distance(begin, end);
    if (size <= 0) {
        throw std::invalid_argument("median_of_sorted: empty range");
    }

    // std::next instead of `begin + k` so non-random-access iterators work too.
    const auto middle_high = std::next(begin, size / 2);
    if (size % 2 != 0) {
        return *middle_high;
    }

    const auto middle_low = std::next(begin, size / 2 - 1);
    // Explicit return type plus cast: with `auto`, element types narrower
    // than int would deduce two different return types and fail to compile.
    return static_cast<value_type>((*middle_low + *middle_high) / 2);
}


// Median of an unsorted random-access range [begin, end).
// Uses std::nth_element instead of a full sort, so the elements of the range
// may be reordered. Returns the middle element for an odd count, or the
// average of the two middle elements (computed in the promoted element type,
// then converted back) for an even count.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
typename std::iterator_traits<Iter>::value_type median(Iter begin, Iter end) {
    using value_type = typename std::iterator_traits<Iter>::value_type;

    const auto size = std::distance(begin, end);
    if (size <= 0) {
        throw std::invalid_argument("median: empty range");
    }

    const auto middle_high = begin + size / 2;
    std::nth_element(begin, middle_high, end);
    if (size % 2 != 0) {
        return *middle_high;
    }

    // After the first call everything before middle_high is <= *middle_high,
    // so the lower middle only has to be selected within that prefix.
    const auto middle_low = middle_high - 1;
    std::nth_element(begin, middle_low, middle_high);

    // Explicit return type plus cast: with `auto`, element types narrower
    // than int would deduce two different return types and fail to compile.
    return static_cast<value_type>((*middle_low + *middle_high) / 2);
}

median_of_sorted has $\mathcal{O}(1)$ runtime complexity (for random access iterators, $\mathcal{O}(n)$ otherwise) and doesn't modify the container.

median has $\mathcal{O}(n)$ runtime complexity and might modify the container.

Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.

Note: The code from median/median_of_sorted can easily be extended to calculate any percentile.

Other issues

unsigned long int might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t or uint64_t from the header <cstdint>. If the bit length itself is not relevant, maybe use size_t (or simply unsigned) instead to prevent confusion.

Do you need the ordering of std::map? If not, you might want to use std::unordered_map instead (hash map instead of binary search tree).

Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.

edited Jul 16 at 16:07

answered Jul 14 at 17:16

hoffmale

4,205630

add a comment |

up vote
4
down vote

accepted

Wrapper classes

I don't get why many algorithms (like get_mean, get_median, ...) are wrapped inside a class that is basically just a fancy namespace.

Why force the user to write mean.get_mean(...) if mean(...) would suffice?

Repetition

In many functions, there is a repeating pattern: The function takes a bool sorted parameter, and the first statement is if(!sorted) std::sort(...);.

First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.

For example, fixing these issues on median::get_median:

// Median of an already sorted range [begin, end); the range is not modified.
// Returns the middle element for an odd count, or the average of the two
// middle elements (computed in the promoted element type, then converted
// back) for an even count.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
typename std::iterator_traits<Iter>::value_type median_of_sorted(Iter begin, Iter end) {
    using value_type = typename std::iterator_traits<Iter>::value_type;

    const auto size = std::distance(begin, end);
    if (size <= 0) {
        throw std::invalid_argument("median_of_sorted: empty range");
    }

    // std::next instead of `begin + k` so non-random-access iterators work too.
    const auto middle_high = std::next(begin, size / 2);
    if (size % 2 != 0) {
        return *middle_high;
    }

    const auto middle_low = std::next(begin, size / 2 - 1);
    // Explicit return type plus cast: with `auto`, element types narrower
    // than int would deduce two different return types and fail to compile.
    return static_cast<value_type>((*middle_low + *middle_high) / 2);
}


// Median of an unsorted random-access range [begin, end).
// Uses std::nth_element instead of a full sort, so the elements of the range
// may be reordered. Returns the middle element for an odd count, or the
// average of the two middle elements (computed in the promoted element type,
// then converted back) for an even count.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
typename std::iterator_traits<Iter>::value_type median(Iter begin, Iter end) {
    using value_type = typename std::iterator_traits<Iter>::value_type;

    const auto size = std::distance(begin, end);
    if (size <= 0) {
        throw std::invalid_argument("median: empty range");
    }

    const auto middle_high = begin + size / 2;
    std::nth_element(begin, middle_high, end);
    if (size % 2 != 0) {
        return *middle_high;
    }

    // After the first call everything before middle_high is <= *middle_high,
    // so the lower middle only has to be selected within that prefix.
    const auto middle_low = middle_high - 1;
    std::nth_element(begin, middle_low, middle_high);

    // Explicit return type plus cast: with `auto`, element types narrower
    // than int would deduce two different return types and fail to compile.
    return static_cast<value_type>((*middle_low + *middle_high) / 2);
}

median_of_sorted has $\mathcal{O}(1)$ runtime complexity (for random access iterators, $\mathcal{O}(n)$ otherwise) and doesn't modify the container.

median has $\mathcal{O}(n)$ runtime complexity and might modify the container.

Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.

Note: The code from median/median_of_sorted can easily be extended to calculate any percentile.

Other issues

unsigned long int might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t or uint64_t from the header <cstdint>. If the bit length itself is not relevant, maybe use size_t (or simply unsigned) instead to prevent confusion.

Do you need the ordering of std::map? If not, you might want to use std::unordered_map instead (hash map instead of binary search tree).

Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.

edited Jul 16 at 16:07

answered Jul 14 at 17:16

hoffmale

4,205630

add a comment |

up vote
4
down vote

accepted

Wrapper classes

I don't get why many algorithms (like get_mean, get_median, ...) are wrapped inside a class that is basically just a fancy namespace.

Why force the user to write mean.get_mean(...) if mean(...) would suffice?

Repetition

In many functions, there is a repeating pattern: The function takes a bool sorted parameter, and the first statement is if(!sorted) std::sort(...);.

First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.

For example, fixing these issues on median::get_median:

// Median of an already sorted range [begin, end); the range is not modified.
// Returns the middle element for an odd count, or the average of the two
// middle elements (computed in the promoted element type, then converted
// back) for an even count.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
typename std::iterator_traits<Iter>::value_type median_of_sorted(Iter begin, Iter end) {
    using value_type = typename std::iterator_traits<Iter>::value_type;

    const auto size = std::distance(begin, end);
    if (size <= 0) {
        throw std::invalid_argument("median_of_sorted: empty range");
    }

    // std::next instead of `begin + k` so non-random-access iterators work too.
    const auto middle_high = std::next(begin, size / 2);
    if (size % 2 != 0) {
        return *middle_high;
    }

    const auto middle_low = std::next(begin, size / 2 - 1);
    // Explicit return type plus cast: with `auto`, element types narrower
    // than int would deduce two different return types and fail to compile.
    return static_cast<value_type>((*middle_low + *middle_high) / 2);
}


// Median of an unsorted random-access range [begin, end).
// Uses std::nth_element instead of a full sort, so the elements of the range
// may be reordered. Returns the middle element for an odd count, or the
// average of the two middle elements (computed in the promoted element type,
// then converted back) for an even count.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
typename std::iterator_traits<Iter>::value_type median(Iter begin, Iter end) {
    using value_type = typename std::iterator_traits<Iter>::value_type;

    const auto size = std::distance(begin, end);
    if (size <= 0) {
        throw std::invalid_argument("median: empty range");
    }

    const auto middle_high = begin + size / 2;
    std::nth_element(begin, middle_high, end);
    if (size % 2 != 0) {
        return *middle_high;
    }

    // After the first call everything before middle_high is <= *middle_high,
    // so the lower middle only has to be selected within that prefix.
    const auto middle_low = middle_high - 1;
    std::nth_element(begin, middle_low, middle_high);

    // Explicit return type plus cast: with `auto`, element types narrower
    // than int would deduce two different return types and fail to compile.
    return static_cast<value_type>((*middle_low + *middle_high) / 2);
}

median_of_sorted has $\mathcal{O}(1)$ runtime complexity (for random access iterators, $\mathcal{O}(n)$ otherwise) and doesn't modify the container.

median has $\mathcal{O}(n)$ runtime complexity and might modify the container.

Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.

Note: The code from median/median_of_sorted can easily be extended to calculate any percentile.

Other issues

unsigned long int might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t or uint64_t from the header <cstdint>. If the bit length itself is not relevant, maybe use size_t (or simply unsigned) instead to prevent confusion.

Do you need the ordering of std::map? If not, you might want to use std::unordered_map instead (hash map instead of binary search tree).

Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.

edited Jul 16 at 16:07

answered Jul 14 at 17:16

hoffmale

4,205630

Wrapper classes

I don't get why many algorithms (like get_mean, get_median, ...) are wrapped inside a class that is basically just a fancy namespace.

Why force the user to write mean.get_mean(...) if mean(...) would suffice?

Repetition

In many functions, there is a repeating pattern: The function takes a bool sorted parameter, and the first statement is if(!sorted) std::sort(...);.

First, such boolean flags usually smell. Often the corresponding function might do two different things, and might be better off being split into multiple specialized functions.

For example, fixing these issues on median::get_median:

// Median of an already sorted range [begin, end); the range is not modified.
// Returns the middle element for an odd count, or the average of the two
// middle elements (computed in the promoted element type, then converted
// back) for an even count.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
typename std::iterator_traits<Iter>::value_type median_of_sorted(Iter begin, Iter end) {
    using value_type = typename std::iterator_traits<Iter>::value_type;

    const auto size = std::distance(begin, end);
    if (size <= 0) {
        throw std::invalid_argument("median_of_sorted: empty range");
    }

    // std::next instead of `begin + k` so non-random-access iterators work too.
    const auto middle_high = std::next(begin, size / 2);
    if (size % 2 != 0) {
        return *middle_high;
    }

    const auto middle_low = std::next(begin, size / 2 - 1);
    // Explicit return type plus cast: with `auto`, element types narrower
    // than int would deduce two different return types and fail to compile.
    return static_cast<value_type>((*middle_low + *middle_high) / 2);
}


// Median of an unsorted random-access range [begin, end).
// Uses std::nth_element instead of a full sort, so the elements of the range
// may be reordered. Returns the middle element for an odd count, or the
// average of the two middle elements (computed in the promoted element type,
// then converted back) for an even count.
// Throws std::invalid_argument when the range is empty.
template<typename Iter>
typename std::iterator_traits<Iter>::value_type median(Iter begin, Iter end) {
    using value_type = typename std::iterator_traits<Iter>::value_type;

    const auto size = std::distance(begin, end);
    if (size <= 0) {
        throw std::invalid_argument("median: empty range");
    }

    const auto middle_high = begin + size / 2;
    std::nth_element(begin, middle_high, end);
    if (size % 2 != 0) {
        return *middle_high;
    }

    // After the first call everything before middle_high is <= *middle_high,
    // so the lower middle only has to be selected within that prefix.
    const auto middle_low = middle_high - 1;
    std::nth_element(begin, middle_low, middle_high);

    // Explicit return type plus cast: with `auto`, element types narrower
    // than int would deduce two different return types and fail to compile.
    return static_cast<value_type>((*middle_low + *middle_high) / 2);
}

median_of_sorted has $\mathcal{O}(1)$ runtime complexity (for random access iterators, $\mathcal{O}(n)$ otherwise) and doesn't modify the container.

median has $\mathcal{O}(n)$ runtime complexity and might modify the container.

Usually, it is known at the call site which algorithm is wanted. This also means that the decision to sort or not to sort a container gets pushed upwards, where there might be more information whether sorting is necessary.

Note: The code from median/median_of_sorted can easily be extended to calculate any percentile.

Other issues

unsigned long int might be 32 (e.g. on 32bit Windows, 32bit unix and 64bit Windows) or 64 (e.g. on 64bit unix) bit long! If a specific bit length is wanted, please use uint32_t or uint64_t from the header <cstdint>. If the bit length itself is not relevant, maybe use size_t (or simply unsigned) instead to prevent confusion.

Do you need the ordering of std::map? If not, you might want to use std::unordered_map instead (hash map instead of binary search tree).

Try to reduce copies if not necessary. Use iterators (if the object is a container), a reference or a pointer to refer to the object instead.

edited Jul 16 at 16:07

answered Jul 14 at 17:16

hoffmale

4,205630

edited Jul 16 at 16:07

answered Jul 14 at 17:16

hoffmale

4,205630

answered Jul 14 at 17:16

hoffmale

4,205630

answered Jul 14 at 17:16

hoffmale

4,205630

add a comment |

draft saved

draft discarded

draft saved

draft discarded

Post as a guest

Name

搜尋此網誌

trjhtr

Basic Single Header statistics and ml library for C++ - Scikit-Learn like implementation

1 Answer
1

Wrapper classes

Repetition

Other issues

Your Answer

Post as a guest

1 Answer
1

1 Answer
1

Wrapper classes

Repetition

Other issues

Wrapper classes

Repetition

Other issues

Wrapper classes

Repetition

Other issues

Wrapper classes

Repetition

Other issues

Post as a guest

Popular posts from this blog

Chat program with C++ and SFML

Read an image with ADNS2610 optical sensor and Arduino Uno

Read files from a directory using Promises

Basic Single Header statistics and ml library for C++ - Scikit-Learn like implementation

1 Answer 1

Wrapper classes

Repetition

Other issues

Your Answer

Sign up or log in

Post as a guest

Post as a guest

1 Answer 1

1 Answer 1

Wrapper classes

Repetition

Other issues

Wrapper classes

Repetition

Other issues

Wrapper classes

Repetition

Other issues

Wrapper classes

Repetition

Other issues

Sign up or log in

Post as a guest

Post as a guest

Sign up or log in

Post as a guest

Sign up or log in

Post as a guest

Sign up or log in

Post as a guest

Popular posts from this blog

Chat program with C++ and SFML

Read an image with ADNS2610 optical sensor and Arduino Uno

Read files from a directory using Promises

1 Answer
1

1 Answer
1

1 Answer
1