GispoCoding · nmaarnio · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024
diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py
@@ -124,7 +124,7 @@ def correlation_matrix(
 ) -> pd.DataFrame:
     """Compute correlation matrix on the input data.
 
-    It is assumed that the data is numeric, i.e. integers or floats.
+    It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations.
 
     Args:
         data: Dataframe containing the input data.
@@ -136,11 +136,15 @@ def correlation_matrix(
         InvalidParameterValueException: min_periods argument is used with method 'kendall'.
 
     Returns:
-        Dataframe containing the correlation matrix
+        Dataframe containing matrix representing the correlation coefficient
+            between the corresponding pair of variables.
     """
     if check_empty_dataframe(data):
         raise exceptions.EmptyDataFrameException("The input Dataframe is empty.")
 
+    if not check_columns_numeric(data, data.columns.to_list()):
+        raise exceptions.NonNumericDataException("The input data contain non-numeric data.")
+
     if correlation_method == "kendall" and min_periods is not None:
         raise exceptions.InvalidParameterValueException(
             "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'."
@@ -157,7 +161,7 @@ def covariance_matrix(
 ) -> pd.DataFrame:
     """Compute covariance matrix on the input data.
 
-    It is assumed that the data is numeric, i.e. integers or floats.
+    It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations.
 
     Args:
         data: Dataframe containing the input data.
@@ -166,17 +170,23 @@ def covariance_matrix(
 
     Raises:
         EmptyDataFrameException: The input Dataframe is empty.
-        InvalidParameterValueException: Provided value for delta_degrees_of_freedom is negative.
+        InvalidParameterValueException: Provided value for delta_degrees_of_freedom or min_periods is negative.
 
     Returns:
-        Dataframe containing the covariance matrix
+        Dataframe containing matrix representing the covariance between the corresponding pair of variables.
     """
     if check_empty_dataframe(data):
         raise exceptions.EmptyDataFrameException("The input Dataframe is empty.")
 
+    if not check_columns_numeric(data, data.columns.to_list()):
+        raise exceptions.NonNumericDataException("The input data contain non-numeric data.")
+
     if delta_degrees_of_freedom < 0:
         raise exceptions.InvalidParameterValueException("Delta degrees of freedom must be non-negative.")
 
+    if min_periods and min_periods < 0:
+        raise exceptions.InvalidParameterValueException("Min periods must be non-negative.")
+
     matrix = data.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)
 
     return matrix
diff --git a/tests/exploratory_analyses/statistical_tests_test.py b/tests/exploratory_analyses/statistical_tests_test.py
@@ -12,11 +12,11 @@
 )
 
 data = np.array([[0, 1, 2, 1], [2, 0, 1, 2], [2, 1, 0, 2], [0, 1, 2, 1]])
-missing_data = np.array([[0, 1, 2, 1, np.nan], [2, 0, 1, 2, np.nan], [2, 1, 0, 2, np.nan], [0, 1, 2, 1, np.nan]])
+missing_data = np.array([[0, 1, 2, 1], [2, 0, np.nan, 2], [2, 1, 0, 2], [0, 1, 2, 1]])
 non_numeric_data = np.array([[0, 1, 2, 1], ["a", "b", "c", "d"], [3, 2, 1, 0], ["c", "d", "b", "a"]])
 numeric_data = pd.DataFrame(data, columns=["a", "b", "c", "d"])
 non_numeric_df = pd.DataFrame(non_numeric_data, columns=["a", "b", "c", "d"])
-missing_values_df = pd.DataFrame(missing_data, columns=["a", "b", "c", "d", "na"])
+missing_values_df = pd.DataFrame(missing_data, columns=["a", "b", "c", "d"])
 categorical_data = pd.DataFrame({"e": [0, 0, 1, 1], "f": [True, False, True, True]})
 target_column = "e"
 np.random.seed(42)
@@ -43,13 +43,27 @@ def test_normality_test():
 def test_normality_test_missing_data():
     """Test that input with missing data returns statistics correctly."""
     output_statistics = normality_test(data=missing_data)
-    np.testing.assert_array_almost_equal(output_statistics, (0.8077, 0.00345), decimal=5)
+    np.testing.assert_array_almost_equal(output_statistics, (0.79921, 0.00359), decimal=5)
     output_statistics = normality_test(data=np.array([0, 2, 2, 0, np.nan]))
     np.testing.assert_array_almost_equal(output_statistics, (0.72863, 0.02386), decimal=5)
     output_statistics = normality_test(data=missing_values_df, columns=["a", "b"])
     np.testing.assert_array_almost_equal(output_statistics["a"], (0.72863, 0.02386), decimal=5)
 
 
+def test_correlation_matrix_nan():
+    """Test that returned correlation matrix is correct when NaN values are present in the dataframe."""
+    expected_correlation_matrix = np.array(
+        [
+            [1.000000, -0.577350, -1.000000, 1.000000],
+            [-0.577350, 1.000000, np.nan, -0.577350],
+            [-1.000000, np.nan, 1.000000, -1.000000],
+            [1.000000, -0.577350, -1.000000, 1.000000],
+        ]
+    )
+    output_matrix = correlation_matrix(data=missing_values_df)
+    np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix)
+
+
 def test_correlation_matrix():
     """Test that returned correlation matrix is correct."""
     expected_correlation_matrix = np.array(
@@ -64,6 +78,26 @@ def test_correlation_matrix():
     np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix)
 
 
+def test_correlation_matrix_non_numeric():
+    """Test that non-numeric input data raises the correct exception."""
+    with pytest.raises(exceptions.NonNumericDataException):
+        correlation_matrix(data=non_numeric_df)
+
+
+def test_covariance_matrix_nan():
+    """Test that returned covariance matrix is correct when NaN values are present in the dataframe."""
+    expected_correlation_matrix = np.array(
+        [
+            [1.333333, -0.333333, -1.333333, 0.666667],
+            [-0.333333, 0.25, 0, -0.166667],
+            [-1.333333, 0, 1.333333, -0.666667],
+            [0.666667, -0.166667, -0.666667, 0.333333],
+        ]
+    )
+    output_matrix = covariance_matrix(data=missing_values_df)
+    np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix)
+
+
 def test_covariance_matrix():
     """Test that returned covariance matrix is correct."""
     expected_covariance_matrix = np.array(
@@ -78,6 +112,12 @@ def test_covariance_matrix():
     np.testing.assert_array_almost_equal(output_matrix, expected_covariance_matrix)
 
 
+def test_covariance_matrix_negative_min_periods():
+    """Test that negative min_periods value raises the correct exception."""
+    with pytest.raises(exceptions.InvalidParameterValueException):
+        covariance_matrix(data=numeric_data, min_periods=-1)
+
+
 def test_empty_df():
     """Test that empty DataFrame raises the correct exception."""
     empty_df = pd.DataFrame()