Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement distance_to_anomaly (Closes #209) #324

Merged
merged 17 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/raster_processing/distance_to_anomaly.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Distance to anomaly

::: eis_toolkit.raster_processing.distance_to_anomaly
224 changes: 224 additions & 0 deletions eis_toolkit/raster_processing/distance_to_anomaly.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
from itertools import chain
from numbers import Number
from pathlib import Path
from tempfile import TemporaryDirectory

import geopandas as gpd
import numpy as np
import rasterio
from beartype import beartype
from beartype.typing import Literal, Optional, Tuple, Union
from rasterio import profiles

from eis_toolkit.exceptions import InvalidParameterValueException
from eis_toolkit.utilities.checks.raster import check_raster_profile
from eis_toolkit.utilities.miscellaneous import row_points, toggle_gdal_exceptions
from eis_toolkit.vector_processing.distance_computation import distance_computation


def _check_threshold_criteria_and_value(threshold_criteria, threshold_criteria_value):
if threshold_criteria in {"lower", "higher"} and not isinstance(threshold_criteria_value, Number):
raise InvalidParameterValueException(
f"Expected threshold_criteria_value {threshold_criteria_value} "
"to be a single number rather than a tuple."
)
if threshold_criteria in {"in_between", "outside"}:
if not isinstance(threshold_criteria_value, tuple):
raise InvalidParameterValueException(
f"Expected threshold_criteria_value ({threshold_criteria_value}) to be a tuple rather than a number."
)
if threshold_criteria_value[0] >= threshold_criteria_value[1]:
raise InvalidParameterValueException(
f"Expected first value in threshold_criteria_value ({threshold_criteria_value})"
"tuple to be lower than the second."
)


@beartype
def distance_to_anomaly(
anomaly_raster_profile: Union[profiles.Profile, dict],
anomaly_raster_data: np.ndarray,
threshold_criteria_value: Union[Tuple[Number, Number], Number],
threshold_criteria: Literal["lower", "higher", "in_between", "outside"],
) -> Tuple[np.ndarray, Union[profiles.Profile, dict]]:
"""Calculate distance from raster cell to nearest anomaly.

The criteria for what is anomalous can be defined as a single number and
criteria text of "higher" or "lower". Alternatively, the definition can be
a range where values inside (criteria text of "within") or outside are
marked as anomalous (criteria text of "outside"). If anomaly_raster_profile does
contain "nodata" key, np.nan is assumed to correspond to nodata values.

Args:
anomaly_raster_profile: The raster profile in which the distances
to the nearest anomalous value are determined.
anomaly_raster_data: The raster data in which the distances
to the nearest anomalous value are determined.
threshold_criteria_value: Value(s) used to define anomalous.
If the threshold criteria requires a tuple of values,
the first value should be the minimum and the second
the maximum value.
threshold_criteria: Method to define anomalous.

Returns:
A 2D numpy array with the distances to anomalies computed
and the original anomaly raster profile.

"""
check_raster_profile(raster_profile=anomaly_raster_profile)
_check_threshold_criteria_and_value(
threshold_criteria=threshold_criteria, threshold_criteria_value=threshold_criteria_value
)

out_image = _distance_to_anomaly(
anomaly_raster_profile=anomaly_raster_profile,
anomaly_raster_data=anomaly_raster_data,
threshold_criteria=threshold_criteria,
threshold_criteria_value=threshold_criteria_value,
)
return out_image, anomaly_raster_profile


@beartype
def distance_to_anomaly_gdal(
anomaly_raster_profile: Union[profiles.Profile, dict],
anomaly_raster_data: np.ndarray,
threshold_criteria_value: Union[Tuple[Number, Number], Number],
threshold_criteria: Literal["lower", "higher", "in_between", "outside"],
output_path: Path,
verbose: bool = False,
) -> Path:
"""Calculate distance from raster cell to nearest anomaly.

Distance is calculated for each cell in the anomaly raster and saved to a
new raster at output_path. The criteria for what is anomalous can be
defined as a single number and criteria text of "higher" or "lower".
Alternatively, the definition can be a range where values inside
(criteria text of "within") or outside are marked as anomalous
(criteria text of "outside"). If anomaly_raster_profile does
contain "nodata" key, np.nan is assumed to correspond to nodata values.

Does not work on Windows.

Args:
anomaly_raster_profile: The raster profile in which the distances
to the nearest anomalous value are determined.
anomaly_raster_data: The raster data in which the distances
to the nearest anomalous value are determined.
threshold_criteria_value: Value(s) used to define anomalous.
threshold_criteria: Method to define anomalous.
output_path: The path to the raster with the distances to anomalies
calculated.
verbose: Whether to print gdal_proximity output.

Returns:
The path to the raster with the distances to anomalies calculated.
"""
check_raster_profile(raster_profile=anomaly_raster_profile)
_check_threshold_criteria_and_value(
threshold_criteria=threshold_criteria, threshold_criteria_value=threshold_criteria_value
)

return _distance_to_anomaly_gdal(
output_path=output_path,
anomaly_raster_profile=anomaly_raster_profile,
anomaly_raster_data=anomaly_raster_data,
threshold_criteria=threshold_criteria,
threshold_criteria_value=threshold_criteria_value,
verbose=verbose,
)


def _fits_criteria(
threshold_criteria_value: Union[Tuple[Number, Number], Number],
threshold_criteria: Literal["lower", "higher", "in_between", "outside"],
anomaly_raster_data: np.ndarray,
nodata_value: Optional[Number],
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this default to Optional[Number] = None?

(The value of the Optional type-hinted parameter doesn't seem to get set to None by default if omitted. The type hint just allows for the type to be either a number or NoneType.)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because the function is a private one I think it is best to make sure all inputs are explicitly passed when using the function so no functionality is missed. Using Optional does not mean you need to set the default value, it just means that input is allowed to be passed as None.

) -> np.ndarray:

criteria_dict = {
"lower": lambda anomaly_raster_data: anomaly_raster_data < threshold_criteria_value,
"higher": lambda anomaly_raster_data: anomaly_raster_data > threshold_criteria_value,
"in_between": lambda anomaly_raster_data: np.logical_and(
anomaly_raster_data > threshold_criteria_value[0], # type: ignore
anomaly_raster_data < threshold_criteria_value[1], # type: ignore
),
"outside": lambda anomaly_raster_data: np.logical_or(
anomaly_raster_data < threshold_criteria_value[0], # type: ignore
anomaly_raster_data > threshold_criteria_value[1], # type: ignore
),
}
mask = anomaly_raster_data == nodata_value if nodata_value is not None else np.isnan(anomaly_raster_data)

return np.where(mask, False, criteria_dict[threshold_criteria](anomaly_raster_data))


def _write_binary_anomaly_raster(tmp_dir: Path, anomaly_raster_profile, data_fits_criteria: np.ndarray):
anomaly_raster_binary_path = tmp_dir / "anomaly_raster_binary.tif"

anomaly_raster_binary_profile = {**anomaly_raster_profile, **dict(dtype=rasterio.uint8, count=1, nodata=None)}
with rasterio.open(anomaly_raster_binary_path, mode="w", **anomaly_raster_binary_profile) as anomaly_raster_binary:
anomaly_raster_binary.write(data_fits_criteria.astype(rasterio.uint8), 1)

return anomaly_raster_binary_path


def _distance_to_anomaly_gdal(
anomaly_raster_profile: Union[profiles.Profile, dict],
anomaly_raster_data: np.ndarray,
threshold_criteria_value: Union[Tuple[Number, Number], Number],
threshold_criteria: Literal["lower", "higher", "in_between", "outside"],
output_path: Path,
verbose: bool,
):
from osgeo_utils import gdal_proximity

data_fits_criteria = _fits_criteria(
threshold_criteria=threshold_criteria,
threshold_criteria_value=threshold_criteria_value,
anomaly_raster_data=anomaly_raster_data,
nodata_value=anomaly_raster_profile.get("nodata"),
)

with TemporaryDirectory() as tmp_dir_str:
tmp_dir = Path(tmp_dir_str)
anomaly_raster_binary_path = _write_binary_anomaly_raster(
tmp_dir=tmp_dir, anomaly_raster_profile=anomaly_raster_profile, data_fits_criteria=data_fits_criteria
)
with toggle_gdal_exceptions():
gdal_proximity.gdal_proximity(
src_filename=str(anomaly_raster_binary_path),
dst_filename=str(output_path),
alg_options=("VALUES=1", "DISTUNITS=GEO"),
quiet=not verbose,
)

return output_path


def _distance_to_anomaly(
anomaly_raster_profile: Union[profiles.Profile, dict],
anomaly_raster_data: np.ndarray,
threshold_criteria_value: Union[Tuple[Number, Number], Number],
threshold_criteria: Literal["lower", "higher", "in_between", "outside"],
) -> np.ndarray:
data_fits_criteria = _fits_criteria(
threshold_criteria=threshold_criteria,
threshold_criteria_value=threshold_criteria_value,
anomaly_raster_data=anomaly_raster_data,
nodata_value=anomaly_raster_profile.get("nodata"),
)

cols = np.arange(anomaly_raster_data.shape[1])
rows = np.arange(anomaly_raster_data.shape[0])

all_points_by_rows = [
row_points(row=row, cols=cols[data_fits_criteria[row]], raster_transform=anomaly_raster_profile["transform"])
for row in rows
]
all_points = list(chain(*all_points_by_rows))
all_points_gdf = gpd.GeoDataFrame(geometry=all_points, crs=anomaly_raster_profile["crs"])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Just a suggestion, feel free to ignore this if you think best.)

If all_points ends up empty due to nothing matching the criteria, this function could raise an exception with an informative message. The distance_computation does raise an exception when checking the shape, but the message is sort of generic, and it might be good to let the user know why it's happened in this case.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, added a check that the criteria matches at least some of the data!


distance_array = distance_computation(raster_profile=anomaly_raster_profile, geometries=all_points_gdf)

return distance_array
29 changes: 29 additions & 0 deletions eis_toolkit/utilities/checks/raster.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import rasterio
import rasterio.profiles
import rasterio.transform
from beartype import beartype
from beartype.typing import Iterable, Sequence, Union

from eis_toolkit.exceptions import InvalidParameterValueException


@beartype
def check_matching_cell_size(
Expand Down Expand Up @@ -166,3 +170,28 @@ def check_quadratic_pixels(raster: rasterio.io.DatasetReader) -> bool:
return True
else:
return False


@beartype
def check_raster_profile(
raster_profile: Union[rasterio.profiles.Profile, dict],
):
"""Check raster profile values.

Checks that width and height are sensible and that the profile contains a
transform.
"""
raster_width = raster_profile.get("width")
raster_height = raster_profile.get("height")

if not isinstance(raster_width, int) or not isinstance(raster_height, int):
raise InvalidParameterValueException(
f"Expected raster_profile to contain integer width and height. {raster_profile}"
)

raster_transform = raster_profile.get("transform")

if not isinstance(raster_transform, rasterio.transform.Affine):
raise InvalidParameterValueException(
f"Expected raster_profile to contain an affine transformation. {raster_profile}"
)
48 changes: 48 additions & 0 deletions eis_toolkit/utilities/miscellaneous.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from contextlib import contextmanager
from numbers import Number

import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Any, List, Optional, Sequence, Tuple, Union
from osgeo import gdal
from rasterio import transform
from shapely.geometry import Point

from eis_toolkit.exceptions import InvalidColumnException, InvalidColumnIndexException, InvalidDataShapeException
from eis_toolkit.utilities.checks.dataframe import check_columns_valid
Expand Down Expand Up @@ -362,3 +366,47 @@ def rename_columns(df: pd.DataFrame, colnames=Sequence[str]) -> pd.DataFrame:
names[columns[i]] = colnames[i]

return df.rename(columns=names)


def row_points(
row: int,
cols: np.ndarray,
raster_transform: transform.Affine,
) -> List[Point]:
"""Transform raster row cells to shapely Points.

Args:
row: Row index of cells to transfer
cols: Array of column indexes to transfer
raster_transform: Affine transformation matrix of the raster

Returns:
List of shapely Points
"""
# transform.xy accepts either cols or rows as an array. The other then has
# to be an integer. The resulting x and y point coordinates are therefore
# in a 1D array

if len(cols) == 0:
return []

point_xs, point_ys = zip(*[raster_transform * (col + 0.5, row + 0.5) for col in cols])
# point_xs, point_ys = transform.xy(transform=raster_transform, cols=cols, rows=row)
return [Point(x, y) for x, y in zip(point_xs, point_ys)]


@contextmanager
def toggle_gdal_exceptions():
"""Toggle GDAL exceptions using a context manager.

If the exceptions are already enabled, this function will do nothing.
"""
already_has_exceptions_enabled = False
try:
if gdal.GetUseExceptions() != 0:
already_has_exceptions_enabled = True
gdal.UseExceptions()
yield
finally:
if not already_has_exceptions_enabled:
gdal.DontUseExceptions()
30 changes: 11 additions & 19 deletions eis_toolkit/vector_processing/distance_computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
from beartype import beartype
from beartype.typing import Union
from rasterio import profiles, transform
from shapely.geometry import Point
from shapely.geometry.base import BaseGeometry, BaseMultipartGeometry

from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonMatchingCrsException
from eis_toolkit.exceptions import EmptyDataFrameException, NonMatchingCrsException
from eis_toolkit.utilities.checks.raster import check_raster_profile
from eis_toolkit.utilities.miscellaneous import row_points


@beartype
Expand All @@ -27,21 +28,12 @@ def distance_computation(raster_profile: Union[profiles.Profile, dict], geometri
if geometries.shape[0] == 0:
raise EmptyDataFrameException("Expected GeoDataFrame to not be empty.")

check_raster_profile(raster_profile=raster_profile)

raster_width = raster_profile.get("width")
raster_height = raster_profile.get("height")

if not isinstance(raster_width, int) or not isinstance(raster_height, int):
raise InvalidParameterValueException(
f"Expected raster_profile to contain integer width and height. {raster_profile}"
)

raster_transform = raster_profile.get("transform")

if not isinstance(raster_transform, transform.Affine):
raise InvalidParameterValueException(
f"Expected raster_profile to contain an affine transformation. {raster_profile}"
)

return _distance_computation(
raster_width=raster_width, raster_height=raster_height, raster_transform=raster_transform, geometries=geometries
)
Expand All @@ -53,12 +45,12 @@ def _calculate_row_distances(
raster_transform: transform.Affine,
geometries_unary_union: Union[BaseGeometry, BaseMultipartGeometry],
) -> np.ndarray:
# transform.xy accepts either cols or rows as an array. The other then has
# to be an integer. The resulting x and y point coordinates are therefore
# in a 1D array
point_xs, point_ys = transform.xy(transform=raster_transform, cols=cols, rows=row)
row_points = [Point(x, y) for x, y in zip(point_xs, point_ys)]
row_distances = np.array([point.distance(geometries_unary_union) for point in row_points])
row_distances = np.array(
[
point.distance(geometries_unary_union)
for point in row_points(row=row, cols=cols, raster_transform=raster_transform)
]
)
return row_distances


Expand Down
Loading
Loading