Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

265 improve covariance and correlation matrix functions #268

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions eis_toolkit/exploratory_analyses/statistical_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def correlation_matrix(
) -> pd.DataFrame:
"""Compute correlation matrix on the input data.

It is assumed that the data is numeric, i.e. integers or floats.
It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations.

Args:
data: Dataframe containing the input data.
Expand All @@ -136,11 +136,15 @@ def correlation_matrix(
InvalidParameterValueException: min_periods argument is used with method 'kendall'.

Returns:
Dataframe containing the correlation matrix
Dataframe containing matrix representing the correlation coefficient \
between the corresponding pair of variables.
"""
if check_empty_dataframe(data):
raise exceptions.EmptyDataFrameException("The input Dataframe is empty.")

if not check_columns_numeric(data, data.columns.to_list()):
raise exceptions.NonNumericDataException("The input data contain non-numeric data.")

if correlation_method == "kendall" and min_periods is not None:
raise exceptions.InvalidParameterValueException(
"The argument min_periods is available only with correlation methods 'pearson' and 'spearman'."
Expand All @@ -157,7 +161,7 @@ def covariance_matrix(
) -> pd.DataFrame:
"""Compute covariance matrix on the input data.

It is assumed that the data is numeric, i.e. integers or floats.
It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations.

Args:
data: Dataframe containing the input data.
Expand All @@ -166,17 +170,23 @@ def covariance_matrix(

Raises:
EmptyDataFrameException: The input Dataframe is empty.
InvalidParameterValueException: Provided value for delta_degrees_of_freedom is negative.
InvalidParameterValueException: Provided value for delta_degrees_of_freedom or min_periods is negative.

Returns:
Dataframe containing the covariance matrix
Dataframe containing matrix representing the covariance between the corresponding pair of variables.
"""
if check_empty_dataframe(data):
raise exceptions.EmptyDataFrameException("The input Dataframe is empty.")

if not check_columns_numeric(data, data.columns.to_list()):
raise exceptions.NonNumericDataException("The input data contain non-numeric data.")

if delta_degrees_of_freedom < 0:
raise exceptions.InvalidParameterValueException("Delta degrees of freedom must be non-negative.")

if min_periods and min_periods < 0:
raise exceptions.InvalidParameterValueException("Min perioids must be non-negative.")

matrix = data.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)

return matrix
46 changes: 43 additions & 3 deletions tests/exploratory_analyses/statistical_tests_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
)

# Shared fixtures for the statistical test cases in this module.
data = np.array([[0, 1, 2, 1], [2, 0, 1, 2], [2, 1, 0, 2], [0, 1, 2, 1]])
# Same grid as `data`, except that one cell (row 1, third column) is NaN.
# The earlier variant with a trailing all-NaN column is dropped: keeping both
# definitions made the 5-name DataFrame below fail against this 4-column array.
missing_data = np.array([[0, 1, 2, 1], [2, 0, np.nan, 2], [2, 1, 0, 2], [0, 1, 2, 1]])
# Mixing integers and strings makes NumPy store every cell as a string.
non_numeric_data = np.array([[0, 1, 2, 1], ["a", "b", "c", "d"], [3, 2, 1, 0], ["c", "d", "b", "a"]])
numeric_data = pd.DataFrame(data, columns=["a", "b", "c", "d"])
non_numeric_df = pd.DataFrame(non_numeric_data, columns=["a", "b", "c", "d"])
missing_values_df = pd.DataFrame(missing_data, columns=["a", "b", "c", "d"])
categorical_data = pd.DataFrame({"e": [0, 0, 1, 1], "f": [True, False, True, True]})
target_column = "e"
# Seed NumPy's global RNG so later random draws are repeatable.
np.random.seed(42)
Expand All @@ -43,13 +43,27 @@ def test_normality_test():
def test_normality_test_missing_data():
    """Test that normality statistics are returned correctly for input with missing values."""
    # 2D NumPy array input containing NaN. Only one expected value is asserted:
    # two different expectations for the same result could never both pass.
    output_statistics = normality_test(data=missing_data)
    np.testing.assert_array_almost_equal(output_statistics, (0.79921, 0.00359), decimal=5)

    # 1D NumPy array input with a trailing NaN.
    output_statistics = normality_test(data=np.array([0, 2, 2, 0, np.nan]))
    np.testing.assert_array_almost_equal(output_statistics, (0.72863, 0.02386), decimal=5)

    # DataFrame input restricted to selected columns; the result is looked up by column name.
    output_statistics = normality_test(data=missing_values_df, columns=["a", "b"])
    np.testing.assert_array_almost_equal(output_statistics["a"], (0.72863, 0.02386), decimal=5)


def test_correlation_matrix_nan():
    """Test that the correlation matrix is correct when the dataframe contains NaN."""
    nan = np.nan
    expected = np.array(
        [
            [1.0, -0.57735, -1.0, 1.0],
            [-0.57735, 1.0, nan, -0.57735],
            [-1.0, nan, 1.0, -1.0],
            [1.0, -0.57735, -1.0, 1.0],
        ]
    )
    result = correlation_matrix(data=missing_values_df)
    np.testing.assert_array_almost_equal(result, expected)


def test_correlation_matrix():
"""Test that returned correlation matrix is correct."""
expected_correlation_matrix = np.array(
Expand All @@ -64,6 +78,26 @@ def test_correlation_matrix():
np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix)


def test_correlation_matrix_non_numeric():
    """Test that non-numeric input data raises the correct exception."""
    with pytest.raises(exceptions.NonNumericDataException):
        correlation_matrix(data=non_numeric_df)


def test_covariance_matrix_nan():
    """Test that the covariance matrix is correct when the dataframe contains NaN."""
    expected_covariance_matrix = np.array(
        [
            [1.333333, -0.333333, -1.333333, 0.666667],
            [-0.333333, 0.25, 0, -0.166667],
            [-1.333333, 0, 1.333333, -0.666667],
            [0.666667, -0.166667, -0.666667, 0.333333],
        ]
    )
    result = covariance_matrix(data=missing_values_df)
    np.testing.assert_array_almost_equal(result, expected_covariance_matrix)


def test_covariance_matrix():
"""Test that returned covariance matrix is correct."""
expected_covariance_matrix = np.array(
Expand All @@ -78,6 +112,12 @@ def test_covariance_matrix():
np.testing.assert_array_almost_equal(output_matrix, expected_covariance_matrix)


def test_covariance_matrix_negative_min_periods():
    """Test that a negative min_periods value raises the correct exception."""
    negative_min_periods = -1
    with pytest.raises(exceptions.InvalidParameterValueException):
        covariance_matrix(data=numeric_data, min_periods=negative_min_periods)


def test_empty_df():
"""Test that empty DataFrame raises the correct exception."""
empty_df = pd.DataFrame()
Expand Down
Loading