Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP - NNFDM implementation in AXL #131

Draft
wants to merge 39 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
6951d3a
Prototype for NNFDM implementation in AXL
mcfadden8 Aug 1, 2022
ffb4dcc
Use new HPE C++ NNFDM library
mcfadden8 Oct 4, 2022
18b0cfb
Leave NNFDM off by default for now
mcfadden8 Oct 4, 2022
2dc695c
Add NNFDM Cancel
mcfadden8 Oct 17, 2022
bae2a8d
Turn HPE data mover back on
mcfadden8 Jan 5, 2023
d52d9a9
make nnfdm inclusion automatic
mcfadden8 Jan 6, 2023
6d8cf6c
Merge branch 'main' into mcfadden8/add-dm-client
mcfadden8 Jan 6, 2023
e719849
Added basic test of nnf-dm library
mcfadden8 Feb 14, 2023
0277f59
Checkpoint: SCR now successfully is using nnfdm to copy files. The n…
mcfadden8 Feb 18, 2023
13bdf5f
Initial working version with single rank
mcfadden8 Feb 27, 2023
e64691e
Removed unused code
mcfadden8 Apr 27, 2023
bc0f07f
Minor debug statement updates
mcfadden8 Apr 27, 2023
2f7479b
Updates to build with latest version of nnfdm library
mcfadden8 May 8, 2023
5c2d38d
Update signatures to latest nnfdm library
mcfadden8 May 9, 2023
9f60116
Updates from main
mcfadden8 May 9, 2023
e14bd94
Added find dependency for NNFDM if/when NNFDM is included
mcfadden8 May 9, 2023
830cee7
Add nnfdm to list of external static libs for AXL
mcfadden8 May 10, 2023
2958a74
Added nnf user container example
mcfadden8 Jun 27, 2023
45a0b4a
Updates
mcfadden8 Jun 27, 2023
fdb967d
Working set of hpe containers for rabbits
mcfadden8 Jul 10, 2023
6d2b38b
Added documentation for building Dockerfile
mcfadden8 Jul 10, 2023
a2e3e45
Initial container created
mcfadden8 Jul 10, 2023
638f969
Updates
mcfadden8 Jul 10, 2023
d911cf6
Added container for Hari
mcfadden8 Jul 10, 2023
5755781
Added container for Cameron
mcfadden8 Jul 10, 2023
e1552c6
Updates
mcfadden8 Jul 10, 2023
a1064ce
Updates
mcfadden8 Aug 3, 2023
da4b28c
Another update
mcfadden8 Aug 4, 2023
0f9b660
Separate nnf containers for hetchy and tioga
mcfadden8 Aug 4, 2023
b040ff6
Make NNFDM inclusion optional
mcfadden8 Aug 7, 2023
b1a372b
Merge branch 'main' into mcfadden8/add-dm-client
mcfadden8 Aug 7, 2023
33bb280
Updates: run some mpiGraph tests on rabbits
mcfadden8 Oct 2, 2023
87068a9
Updates for v0.0.6 version of library
mcfadden8 Oct 2, 2023
da33014
Added new profile parameter for nnfdm::CreateResponse API
mcfadden8 Jun 7, 2024
28922bd
Improved error handling around nnfdm create
mcfadden8 Jun 18, 2024
ac58578
Added more debugging capabilities
mcfadden8 Aug 5, 2024
2608e22
Use SCR data movement profile
mcfadden8 Aug 28, 2024
e59fc12
Work with nnfdm SCR profile
mcfadden8 Aug 29, 2024
6598792
axl: Update to latest v0.1.7 nnf-dm version
mcfadden8 Sep 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ MESSAGE(STATUS "ENABLE_IBM_BBAPI: ${ENABLE_IBM_BBAPI}")
OPTION(ENABLE_CRAY_DW "Whether to enable Cray Datawarp support" OFF)
MESSAGE(STATUS "ENABLE_CRAY_DW: ${ENABLE_CRAY_DW}")

# HPE Data Mover (NNFDM) support is requested by default. The backend is only
# built when the NNFDM package is actually located at configure time.
OPTION(ENABLE_HPE_NNFDM "Whether to enable HPE Data Mover support" ON)
MESSAGE(STATUS "ENABLE_HPE_NNFDM: ${ENABLE_HPE_NNFDM}")

OPTION(ENABLE_TESTS "Whether to build tests" ON)
MESSAGE(STATUS "ENABLE_TESTS: ${ENABLE_TESTS}")

Expand Down Expand Up @@ -80,6 +83,16 @@ IF(ENABLE_CRAY_DW)
# LIST(APPEND AXL_LINK_LINE " -L${WITH_DATAWARP_PREFIX}/lib64 -ldatawarp")
ENDIF(ENABLE_CRAY_DW)

## HPE Data Mover (NNFDM)
# Optional dependency: when the option is on, look for the package and wire it
# in only if it is found. A missing package is not an error.
IF(ENABLE_HPE_NNFDM)
    FIND_PACKAGE(NNFDM)
    IF(NNFDM_FOUND)
        # Make the library available to the AXL link line and its headers
        # visible to the sources, then record availability as HAVE_NNFDM.
        LIST(APPEND AXL_EXTERNAL_LIBS ${NNFDM_LIBRARIES})
        INCLUDE_DIRECTORIES(${NNFDM_INCLUDE_DIRS})
        SET(HAVE_NNFDM TRUE)
    ENDIF()
ENDIF()

## IBM Burst Buffer API
IF(ENABLE_IBM_BBAPI)
FIND_PACKAGE(BBAPI)
Expand Down
41 changes: 41 additions & 0 deletions cmake/FindNNFDM.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# FindNNFDM
# ---------
# Locate the HPE Data Mover library (libnnfdm) and its header.
#
# Result variables:
#   NNFDM_FOUND        - TRUE when both the library and nnfdm.h were located
#   NNFDM_INCLUDE_DIRS - Directory containing nnfdm.h
#   NNFDM_LIBRARIES    - Full path of the nnfdm library
#
# Input variable:
#   WITH_NNFDM_PREFIX  - Install prefix of nnfdm, used as a search hint
#
# The library is still young, so for now this module assumes that
# WITH_NNFDM_PREFIX points at a directory laid out as follows:
#
#   <prefix>/
#     lib64/
#       libnnfdm.a
#     include/
#       datamovement.pb-c.h
#       nnfdm.h
#
# Example invocation (the prefix must exist):
#   cmake -DWITH_NNFDM_PREFIX="/path/to/nnfdm" -DMPI=ON ..

# Header search: only nnfdm.h is probed.
FIND_PATH(NNFDM_INCLUDE_DIRS
    NAMES nnfdm.h
    HINTS ${WITH_NNFDM_PREFIX}/include
)

# Library search: only the lib64 subdirectory of the prefix is hinted.
FIND_LIBRARY(NNFDM_LIBRARIES
    NAMES nnfdm
    HINTS ${WITH_NNFDM_PREFIX}/lib64
)

# Sets NNFDM_FOUND and prints the standard found / not-found message.
INCLUDE(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(NNFDM DEFAULT_MSG
    NNFDM_LIBRARIES
    NNFDM_INCLUDE_DIRS
)

# Keep the cache entries out of the default ccmake / cmake-gui view.
MARK_AS_ADVANCED(
    NNFDM_INCLUDE_DIRS
    NNFDM_LIBRARIES
)
1 change: 1 addition & 0 deletions cmake/config.h.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Machine Specific Libs
// Each HAVE_* line below is a CMake template entry: it is turned into a
// #define only when the CMake variable of the same name is true at configure
// time (presumably via configure_file(); the call is not in this file).
#cmakedefine HAVE_PTHREADS
#cmakedefine HAVE_DATAWARP
#cmakedefine HAVE_NNFDM
#cmakedefine HAVE_BBAPI
#cmakedefine HAVE_BBAPI_FALLBACK
4 changes: 4 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ IF(HAVE_DATAWARP)
LIST(APPEND libaxl_srcs axl_async_datawarp.c)
ENDIF(HAVE_DATAWARP)

# Compile the NNFDM transfer backend only when HAVE_NNFDM is set.
IF(HAVE_NNFDM)
    LIST(APPEND libaxl_srcs axl_async_nnfdm.c)
ENDIF()

# Default AXL library is withOUT MPI
ADD_LIBRARY(axl_o OBJECT ${libaxl_srcs})

Expand Down
59 changes: 57 additions & 2 deletions src/axl.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@
#include "axl_async_bbapi.h"
#endif /* HAVE_BBAPI */

#ifdef HAVE_NNFDM
#include "axl_async_nnfdm.h"
#endif /* HAVE_NNFDM */

#ifdef HAVE_DATAWARP
#include "axl_async_datawarp.h"
#endif /* HAVE_DATAWARP */
Expand Down Expand Up @@ -91,6 +95,7 @@ static unsigned int axl_kvtrees_count = 0;
static int bbapi_is_loaded = 0;
#endif


/* Allocate a new kvtree and return the AXL ID for it. If state_file is
* specified, then populate the kvtree with it's data. */
static int axl_alloc_id(const char* state_file)
Expand Down Expand Up @@ -205,10 +210,12 @@ static axl_xfer_t axl_detect_native_xfer(void)
* DataWarp libraries. In the real world, our supercomputer is only going
* to have one of those libraries, so just use whatever we find at
* build time. */
#ifdef HAVE_BBAPI
#if defined(HAVE_BBAPI)
xtype = AXL_XFER_ASYNC_BBAPI;
#elif HAVE_DATAWARP
#elif defined(HAVE_DATAWARP)
xtype = AXL_XFER_ASYNC_DW;
#elif defined(HAVE_NNFDM)
xtype = AXL_XFER_ASYNC_NNFDM;
#else
xtype = AXL_XFER_SYNC;
#endif
Expand Down Expand Up @@ -293,6 +300,11 @@ int AXL_Finalize (void)
}
#endif

#ifdef HAVE_NNFDM
axl_async_finalize_nnfdm();
#endif


/* decrement reference count and free data structures on last call */
axl_init_count--;
if (axl_init_count == 0) {
Expand Down Expand Up @@ -688,6 +700,15 @@ int AXL_Create(axl_xfer_t xtype, const char* name, const char* state_file)
#endif /* HAVE_DATAWARP */
break;

case AXL_XFER_ASYNC_NNFDM:
#ifndef HAVE_NNFDM
AXL_ERR("NNFDM requested but not enabled during build");
rc = AXL_FAILURE;
#else
axl_async_init_nnfdm();
#endif /* HAVE_NNFDM */
break;

default:
AXL_ERR("Unknown transfer type (%d)", (int) xtype);
rc = AXL_FAILURE;
Expand Down Expand Up @@ -826,6 +847,11 @@ static int __AXL_Add (int id, const char* src, const char* dest)
break;
#endif /* HAVE_DATAWARP */

#ifdef HAVE_NNFDM
case AXL_XFER_ASYNC_NNFDM:
/* Nothing is done for NNFDM when a file is added. The case is listed so
 * that this transfer type is not rejected by the default branch.
 * NOTE(review): presumably the file list is consumed later, when
 * axl_async_start_nnfdm() runs at dispatch time -- confirm. */
break;
#endif /* HAVE_NNFDM */
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's include a comment here about why we don't have to do anything.


default:
AXL_ERR("Unknown transfer type (%d)", (int) xtype);
rc = AXL_FAILURE;
Expand Down Expand Up @@ -1146,6 +1172,17 @@ int __AXL_Dispatch (int id, int resume)
break;
#endif /* HAVE_DATAWARP */

#ifdef HAVE_NNFDM
case AXL_XFER_ASYNC_NNFDM:
if (resume) {
AXL_ERR("AXL_Resume() isn't supported yet for NNFDM");
rc = AXL_FAILURE;
break;
}
rc = axl_async_start_nnfdm(id);
break;
#endif /* HAVE_NNFDM */

default:
AXL_ERR("Unknown transfer type (%d)", (int) xtype);
rc = AXL_FAILURE;
Expand Down Expand Up @@ -1297,6 +1334,12 @@ int AXL_Test (int id)
break;
#endif /* HAVE_DATAWARP */

#ifdef HAVE_NNFDM
case AXL_XFER_ASYNC_NNFDM:
rc = axl_async_test_nnfdm(id);
break;
#endif /* HAVE_NNFDM */

default:
AXL_ERR("Unknown transfer type (%d)", (int) xtype);
rc = AXL_FAILURE;
Expand Down Expand Up @@ -1367,6 +1410,12 @@ int AXL_Wait (int id)
break;
#endif /* HAVE_DATAWARP */

#ifdef HAVE_NNFDM
case AXL_XFER_ASYNC_NNFDM:
rc = axl_async_wait_nnfdm(id);
break;
#endif /* HAVE_NNFDM */

default:
AXL_ERR("Unknown transfer type (%d)", (int) xtype);
rc = AXL_FAILURE;
Expand Down Expand Up @@ -1465,6 +1514,12 @@ int AXL_Cancel (int id)
break;
#endif

/* NOTE(review): the NNFDM cancel path is compiled out by the "#if 0" below.
 * As written, a cancel request for an AXL_XFER_ASYNC_NNFDM transfer appears
 * to reach the default branch and report "Unknown transfer type".
 * TODO: confirm whether this can be guarded by HAVE_NNFDM instead. */
#if 0
case AXL_XFER_ASYNC_NNFDM:
rc = axl_async_cancel_nnfdm(id);
break;
#endif

default:
AXL_ERR("Unknown transfer type (%d)", (int) xtype);
rc = AXL_FAILURE;
Expand Down
1 change: 1 addition & 0 deletions src/axl.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ typedef enum {
AXL_XFER_SYNC, /* synchronous copy */
AXL_XFER_ASYNC_DAEMON, /* async daemon process (not used, but kept to maintain enum values) */
AXL_XFER_ASYNC_DW, /* Cray Datawarp */
AXL_XFER_ASYNC_NNFDM, /* HPE DataMover */
AXL_XFER_ASYNC_BBAPI, /* IBM Burst Buffer API */
AXL_XFER_NATIVE, /* Autodetect and use the native API (BBAPI, DW,
* etc) for this node type. It may or may not
Expand Down
Loading