net/mpich: unbreak optimized runtime after 88e134883d
Runtime may fail without an L0 (Level Zero) driver such as intel-compute-runtime, e.g.:

$ mpivars
Abort(268484367) on node 0 (rank 0 in comm 0): Fatal error in PMPI_Init_thread: Other MPI error, error stack:
MPIR_Init_thread(153): gpu_init failed
[unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=268484367
:
system msg for write_line failure : Bad file descriptor
Attempting to use an MPI routine before initializing MPICH

$ MPIR_CVAR_ENABLE_GPU=0 mpivars
Abort(2139535) on node 0 (rank 0 in comm 0): Fatal error in PMPI_Init_thread: Other MPI error, error stack:
MPIR_Init_thread(159)......:
MPID_Init(591).............:
MPIDI_SHM_mpi_init_hook(22):
MPIDI_IPC_mpi_init_hook(36):
MPIDI_GPU_mpi_init_hook(79): gpu_get_dev_count failed
[unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=2139535
:
system msg for write_line failure : Bad file descriptor
Abort(2139535) on node 0 (rank 0 in comm 0): Fatal error in PMPI_Init_thread: Other MPI error, error stack:
MPIR_Init_thread(159)......:
MPID_Init(591).............:
MPIDI_SHM_mpi_init_hook(22):
MPIDI_IPC_mpi_init_hook(36):
MPIDI_GPU_mpi_init_hook(79): gpu_get_dev_count failed
[unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=2139535
:
system msg for write_line failure : Bad file descriptor
Segmentation fault

PR: 256244 (for tracking)
This commit is contained in:
parent
ea525437cd
commit
b5815e7648
2 changed files with 45 additions and 1 deletions
|
@ -1,6 +1,6 @@
|
||||||
PORTNAME= mpich
|
PORTNAME= mpich
|
||||||
PORTVERSION= 3.4.2
|
PORTVERSION= 3.4.2
|
||||||
PORTREVISION= 2
|
PORTREVISION= 3
|
||||||
CATEGORIES= net parallel
|
CATEGORIES= net parallel
|
||||||
MASTER_SITES= https://www.mpich.org/static/downloads/${DISTVERSION}/
|
MASTER_SITES= https://www.mpich.org/static/downloads/${DISTVERSION}/
|
||||||
|
|
||||||
|
|
44
net/mpich/files/patch-l0-fallback
Normal file
44
net/mpich/files/patch-l0-fallback
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
$ pkg delete intel-compute-runtime
|
||||||
|
$ mpivars
|
||||||
|
PCI: Failed to initialize libpciaccess with pci_system_init(): 6 (Permission denied)
|
||||||
|
Abort(268484367) on node 0 (rank 0 in comm 0): Fatal error in PMPI_Init_thread: Other MPI error, error stack:
|
||||||
|
MPIR_Init_thread(153): gpu_init failed
|
||||||
|
[unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=268484367
|
||||||
|
:
|
||||||
|
system msg for write_line failure : Bad file descriptor
|
||||||
|
Attempting to use an MPI routine before initializing MPICH
|
||||||
|
|
||||||
|
--- src/mpi/init/initthread.c.orig 2021-05-25 17:37:05 UTC
|
||||||
|
+++ src/mpi/init/initthread.c
|
||||||
|
@@ -150,7 +150,9 @@ int MPIR_Init_thread(int *argc, char ***argv, int user
|
||||||
|
* inside MPID_Init */
|
||||||
|
if (MPIR_CVAR_ENABLE_GPU) {
|
||||||
|
int mpl_errno = MPL_gpu_init();
|
||||||
|
- MPIR_ERR_CHKANDJUMP(mpl_errno != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_init");
|
||||||
|
+ MPIR_ERR_CHKANDJUMP(
|
||||||
|
+ mpl_errno != MPL_SUCCESS && mpl_errno != MPL_ERR_GPU_INTERNAL,
|
||||||
|
+ mpi_errno, MPI_ERR_OTHER, "**gpu_init");
|
||||||
|
}
|
||||||
|
|
||||||
|
MPL_atomic_store_int(&MPIR_Process.mpich_state, MPICH_MPI_STATE__IN_INIT);
|
||||||
|
--- src/mpid/ch4/netmod/ofi/ofi_init.c.orig 2021-05-25 17:37:05 UTC
|
||||||
|
+++ src/mpid/ch4/netmod/ofi/ofi_init.c
|
||||||
|
@@ -731,7 +731,6 @@ int MPIDI_OFI_mpi_init_hook(int rank, int size, int ap
|
||||||
|
MPL_gpu_malloc_host(&(MPIDI_OFI_global.am_bufs[i]), MPIDI_OFI_AM_BUFF_SZ);
|
||||||
|
MPIDI_OFI_global.am_reqs[i].event_id = MPIDI_OFI_EVENT_AM_RECV;
|
||||||
|
MPIDI_OFI_global.am_reqs[i].index = i;
|
||||||
|
- MPIR_Assert(MPIDI_OFI_global.am_bufs[i]);
|
||||||
|
MPIDI_OFI_global.am_iov[i].iov_base = MPIDI_OFI_global.am_bufs[i];
|
||||||
|
MPIDI_OFI_global.am_iov[i].iov_len = MPIDI_OFI_AM_BUFF_SZ;
|
||||||
|
MPIDI_OFI_global.am_msg[i].msg_iov = &MPIDI_OFI_global.am_iov[i];
|
||||||
|
--- src/mpl/src/gpu/mpl_gpu_ze.c.orig 2021-05-25 17:37:05 UTC
|
||||||
|
+++ src/mpl/src/gpu/mpl_gpu_ze.c
|
||||||
|
@@ -33,7 +33,7 @@ int MPL_gpu_get_dev_count(int *dev_cnt, int *dev_id)
|
||||||
|
{
|
||||||
|
int ret = MPL_SUCCESS;
|
||||||
|
if (!gpu_initialized) {
|
||||||
|
- ret = MPL_gpu_init();
|
||||||
|
+ MPL_gpu_init();
|
||||||
|
}
|
||||||
|
|
||||||
|
*dev_cnt = device_count;
|
Loading…
Reference in a new issue