diff --git a/SIS/MPI/LSF_submission_scripts/mpi_no_container_single_node_n4_no_cuda_tile2.sh b/SIS/MPI/LSF_submission_scripts/mpi_no_container_single_node_n4_no_cuda_tile2.sh
new file mode 100644
index 0000000..d10993a
--- /dev/null
+++ b/SIS/MPI/LSF_submission_scripts/mpi_no_container_single_node_n4_no_cuda_tile2.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+#BSUB -o mpi_no_container_single_node_n4_no_cuda_tile2.out
+#BSUB -J mpi_no_container_single_node_n4_no_cuda_tile2
+#BSUB -n 4
+#BSUB -R "span[ptile=2]"
+
+mpirun --mca mpi_cuda_support 0 mpi_hello_world
\ No newline at end of file
diff --git a/SIS/MPI/LSF_submission_scripts/mpi_singul_v1103_single_node_n4_no_cuda_tile2.sh b/SIS/MPI/LSF_submission_scripts/mpi_singul_v1103_single_node_n4_no_cuda_tile2.sh
new file mode 100644
index 0000000..c5edc80
--- /dev/null
+++ b/SIS/MPI/LSF_submission_scripts/mpi_singul_v1103_single_node_n4_no_cuda_tile2.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+#BSUB -o mpi_singul_v1103_single_node_n4_no_cuda_tile2.out
+#BSUB -J mpi_singul_v1103_single_node_n4_no_cuda_tile2
+#BSUB -n 4
+#BSUB -R "span[ptile=2]"
+
+mpirun --mca mpi_cuda_support 0 singularity exec mpi_hello_world-1103.img /mpi_hello_world/mpi_hello_world
\ No newline at end of file
diff --git a/SIS/MPI/LSF_submission_scripts/mpi_singul_v210_single_node_n4_no_cuda_tile2.sh b/SIS/MPI/LSF_submission_scripts/mpi_singul_v210_single_node_n4_no_cuda_tile2.sh
new file mode 100644
index 0000000..6fd9ccd
--- /dev/null
+++ b/SIS/MPI/LSF_submission_scripts/mpi_singul_v210_single_node_n4_no_cuda_tile2.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+#BSUB -o mpi_singul_v210_single_node_n4_no_cuda_tile2.out
+#BSUB -J mpi_singul_v210_single_node_n4_no_cuda_tile2
+#BSUB -n 4
+#BSUB -R "span[ptile=2]"
+
+
+mpirun --mca mpi_cuda_support 0 singularity exec mpi_hello_world-210.img /mpi_hello_world/mpi_hello_world
\ No newline at end of file
diff --git a/SIS/MPI/notes.txt b/SIS/MPI/notes.txt
index ab73e44..e4db098 100644
--- a/SIS/MPI/notes.txt
+++ b/SIS/MPI/notes.txt
@@ -1,566 +1,601 @@
## MPIRUN INSIDE CONTAINER
## CONTAINER 1.10.3
[balazsl@lo-login-01]$ singularity exec mpi_hello_world-1103.img mpirun -n 4 /mpi_hello_world/mpi_hello_world
/bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8)
lo-login-01.29595hfi_wait_for_device: The /dev/hfi1_0 device failed to appear after 15.0 seconds: Connection timed out
lo-login-01.29593hfi_wait_for_device: The /dev/hfi1_0 device failed to appear after 15.0 seconds: Connection timed out
lo-login-01.29594hfi_wait_for_device: The /dev/hfi1_0 device failed to appear after 15.0 seconds: Connection timed out
lo-login-01.29596hfi_wait_for_device: The /dev/hfi1_0 device failed to appear after 15.0 seconds: Connection timed out
--------------------------------------------------------------------------
[[3328,1],2]: A high-performance Open MPI point-to-point messaging module was unable to find any relevant network interfaces:
Module: OpenFabrics (openib)
Host: lo-login-01
Another transport will be used instead, although this may result in lower performance.
-------------------------------------------------------------------------- Hello world from host lo-login-01 core 8, processor rank 2 out of 4 processors Hello world from host lo-login-01 core 19, processor rank 0 out of 4 processors Hello world from host lo-login-01 core 29, processor rank 1 out of 4 processors Hello world from host lo-login-01 core 11, processor rank 3 out of 4 processors [lo-login-01:29584] 3 more processes have sent help message help-mpi-btl-base.txt / btl:no-nics [lo-login-01:29584] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages ## CONTAINER 2.1.0 [balazsl@lo-login-01]$ singularity exec mpi_hello_world-210.img mpirun -n 4 /mpi_hello_world/mpi_hello_world /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) Hello world from host lo-login-01 core 8, processor rank 2 out of 4 processors Hello world from host lo-login-01 core 22, processor rank 0 out of 4 processors Hello world from host lo-login-01 core 29, processor rank 1 out of 4 processors Hello world from host lo-login-01 core 11, processor rank 3 out of 4 processors ## MPIRUN OUTSIDE CONTAINER ## CONTAINER 1.10.3 [balazsl@lo-login-01]$ mpirun -n 4 singularity exec mpi_hello_world-1103.img /mpi_hello_world/mpi_hello_world /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) -------------------------------------------------------------------------- A requested component was not found, or was unable to be opened. This means that this component is either not installed or is unable to be used on your system (e.g., sometimes this means that shared libraries that the component requires are unable to be found/loaded). Note that Open MPI stopped checking at the first component that it did not find. Host: lo-login-01 Framework: ess Component: pmi -------------------------------------------------------------------------- -------------------------------------------------------------------------- A requested component was not found, or was unable to be opened. This means that this component is either not installed or is unable to be used on your system (e.g., sometimes this means that shared libraries that the component requires are unable to be found/loaded). Note that Open MPI stopped checking at the first component that it did not find. Host: lo-login-01 Framework: ess Component: pmi -------------------------------------------------------------------------- [lo-login-01:30037] [[INVALID],INVALID] ORTE_ERROR_LOG: Not found in file runtime/orte_init.c at line 129 [lo-login-01:30039] [[INVALID],INVALID] ORTE_ERROR_LOG: Not found in file runtime/orte_init.c at line 129 -------------------------------------------------------------------------- It looks like orte_init failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during orte_init; some of which are due to configuration or environment problems. 
This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): orte_ess_base_open failed --> Returned value Not found (-13) instead of ORTE_SUCCESS -------------------------------------------------------------------------- -------------------------------------------------------------------------- It looks like orte_init failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during orte_init; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): orte_ess_base_open failed --> Returned value Not found (-13) instead of ORTE_SUCCESS -------------------------------------------------------------------------- -------------------------------------------------------------------------- It looks like MPI_INIT failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during MPI_INIT; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): ompi_mpi_init: ompi_rte_init failed --> Returned "Not found" (-13) instead of "Success" (0) -------------------------------------------------------------------------- -------------------------------------------------------------------------- It looks like MPI_INIT failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during MPI_INIT; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): ompi_mpi_init: ompi_rte_init failed --> Returned "Not found" (-13) instead of "Success" (0) -------------------------------------------------------------------------- *** An error occurred in MPI_Init *** on a NULL communicator *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort, *** and potentially your MPI job) *** An error occurred in MPI_Init *** on a NULL communicator *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort, *** and potentially your MPI job) [lo-login-01:30037] Local abort before MPI_INIT completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed! [lo-login-01:30039] Local abort before MPI_INIT completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed! -------------------------------------------------------------------------- A requested component was not found, or was unable to be opened. This means that this component is either not installed or is unable to be used on your system (e.g., sometimes this means that shared libraries that the component requires are unable to be found/loaded). Note that Open MPI stopped checking at the first component that it did not find. Host: lo-login-01 Framework: ess Component: pmi -------------------------------------------------------------------------- -------------------------------------------------------------------------- A requested component was not found, or was unable to be opened. 
This means that this component is either not installed or is unable to be used on your system (e.g., sometimes this means that shared libraries that the component requires are unable to be found/loaded). Note that Open MPI stopped checking at the first component that it did not find. Host: lo-login-01 Framework: ess Component: pmi -------------------------------------------------------------------------- [lo-login-01:30038] [[INVALID],INVALID] ORTE_ERROR_LOG: Not found in file runtime/orte_init.c at line 129 [lo-login-01:30040] [[INVALID],INVALID] ORTE_ERROR_LOG: Not found in file runtime/orte_init.c at line 129 -------------------------------------------------------------------------- It looks like orte_init failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during orte_init; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): orte_ess_base_open failed --> Returned value Not found (-13) instead of ORTE_SUCCESS -------------------------------------------------------------------------- -------------------------------------------------------------------------- It looks like orte_init failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during orte_init; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): orte_ess_base_open failed --> Returned value Not found (-13) instead of ORTE_SUCCESS -------------------------------------------------------------------------- *** An error occurred in MPI_Init *** on a NULL communicator *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort, *** and potentially your MPI job) [lo-login-01:30040] Local abort before MPI_INIT completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed! *** An error occurred in MPI_Init *** on a NULL communicator *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort, *** and potentially your MPI job) [lo-login-01:30038] Local abort before MPI_INIT completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed! -------------------------------------------------------------------------- It looks like MPI_INIT failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during MPI_INIT; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): ompi_mpi_init: ompi_rte_init failed --> Returned "Not found" (-13) instead of "Success" (0) -------------------------------------------------------------------------- -------------------------------------------------------------------------- It looks like MPI_INIT failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during MPI_INIT; some of which are due to configuration or environment problems. 
This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): ompi_mpi_init: ompi_rte_init failed --> Returned "Not found" (-13) instead of "Success" (0) -------------------------------------------------------------------------- ------------------------------------------------------- Primary job terminated normally, but 1 process returned a non-zero exit code.. Per user-direction, the job has been aborted. ------------------------------------------------------- -------------------------------------------------------------------------- mpirun detected that one or more processes exited with non-zero status, thus causing the job to be terminated. The first process to do so was: Process name: [[3008,1],0] Exit code: 1 -------------------------------------------------------------------------- ## CONTAINER 2.1.0 [balazsl@lo-login-01]$ mpirun -n 4 singularity exec mpi_hello_world-210.img /mpi_hello_world/mpi_hello_world /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) Hello world from host lo-login-01 core 11, processor rank 3 out of 4 processors Hello world from host lo-login-01 core 19, processor rank 0 out of 4 processors Hello world from host lo-login-01 core 8, processor rank 2 out of 4 processors Hello world from host lo-login-01 core 11, processor rank 1 out of 4 processors ## COMPUTE NODE [balazsl@lo-login-01]$ bsub -Is -J 'testing Singularity and MPI' -n 4 -W 1:00 bash Generic job. Job <9990> is submitted to queue . <> <> ## MPIRUN INSIDE CONTAINER ## CONTAINER 1.10.3 [balazsl@lo-a2-028]$ singularity exec mpi_hello_world-1103.img mpirun -n 4 /mpi_hello_world/mpi_hello_world /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) lo-a2-028.6452hfi_wait_for_device: The /dev/hfi1_0 device failed to appear after 15.0 seconds: Connection timed out lo-a2-028.6449hfi_wait_for_device: The /dev/hfi1_0 device failed to appear after 15.0 seconds: Connection timed out lo-a2-028.6450hfi_wait_for_device: The /dev/hfi1_0 device failed to appear after 15.0 seconds: Connection timed out lo-a2-028.6451hfi_wait_for_device: The /dev/hfi1_0 device failed to appear after 15.0 seconds: Connection timed out -------------------------------------------------------------------------- [[21509,1],3]: A high-performance Open MPI point-to-point messaging module was unable to find any relevant network interfaces: Module: OpenFabrics (openib) Host: lo-a2-028 Another transport will be used instead, although this may result in lower performance. 
-------------------------------------------------------------------------- Hello world from host lo-a2-028 core 4, processor rank 3 out of 4 processors Hello world from host lo-a2-028 core 39, processor rank 0 out of 4 processors Hello world from host lo-a2-028 core 42, processor rank 1 out of 4 processors Hello world from host lo-a2-028 core 6, processor rank 2 out of 4 processors [lo-a2-028:06440] 3 more processes have sent help message help-mpi-btl-base.txt / btl:no-nics [lo-a2-028:06440] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages ## CONTAINER 2.1.0 [balazsl@lo-a2-028]$ singularity exec mpi_hello_world-210.img mpirun -n 4 /mpi_hello_world/mpi_hello_world /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) Hello world from host lo-a2-028 core 40, processor rank 1 out of 4 processors Hello world from host lo-a2-028 core 5, processor rank 0 out of 4 processors Hello world from host lo-a2-028 core 6, processor rank 3 out of 4 processors Hello world from host lo-a2-028 core 41, processor rank 2 out of 4 processors ## MPIRUN OUTSIDE THE CONTAINER ## CONTAINER 1.10.3 [balazsl@lo-a2-028]$ mpirun -n 4 singularity exec mpi_hello_world-1103.img /mpi_hello_world/mpi_hello_world /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) -------------------------------------------------------------------------- A requested component was not found, or was unable to be opened. This means that this component is either not installed or is unable to be used on your system (e.g., sometimes this means that shared libraries that the component requires are unable to be found/loaded). Note that Open MPI stopped checking at the first component that it did not find. Host: lo-a2-028 Framework: ess Component: pmi -------------------------------------------------------------------------- [lo-a2-028:23089] [[INVALID],INVALID] ORTE_ERROR_LOG: Not found in file runtime/orte_init.c at line 129 -------------------------------------------------------------------------- It looks like orte_init failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during orte_init; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): orte_ess_base_open failed --> Returned value Not found (-13) instead of ORTE_SUCCESS -------------------------------------------------------------------------- *** An error occurred in MPI_Init *** on a NULL communicator *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort, *** and potentially your MPI job) [lo-a2-028:23089] Local abort before MPI_INIT completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed! -------------------------------------------------------------------------- It looks like MPI_INIT failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during MPI_INIT; some of which are due to configuration or environment problems. 
This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): ompi_mpi_init: ompi_rte_init failed --> Returned "Not found" (-13) instead of "Success" (0) -------------------------------------------------------------------------- -------------------------------------------------------------------------- A requested component was not found, or was unable to be opened. This means that this component is either not installed or is unable to be used on your system (e.g., sometimes this means that shared libraries that the component requires are unable to be found/loaded). Note that Open MPI stopped checking at the first component that it did not find. Host: lo-a2-028 Framework: ess Component: pmi -------------------------------------------------------------------------- [lo-a2-028:23087] [[INVALID],INVALID] ORTE_ERROR_LOG: Not found in file runtime/orte_init.c at line 129 -------------------------------------------------------------------------- A requested component was not found, or was unable to be opened. This means that this component is either not installed or is unable to be used on your system (e.g., sometimes this means that shared libraries that the component requires are unable to be found/loaded). Note that Open MPI stopped checking at the first component that it did not find. Host: lo-a2-028 Framework: ess Component: pmi -------------------------------------------------------------------------- -------------------------------------------------------------------------- It looks like orte_init failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during orte_init; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): orte_ess_base_open failed --> Returned value Not found (-13) instead of ORTE_SUCCESS -------------------------------------------------------------------------- [lo-a2-028:23086] [[INVALID],INVALID] ORTE_ERROR_LOG: Not found in file runtime/orte_init.c at line 129 *** An error occurred in MPI_Init *** on a NULL communicator *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort, *** and potentially your MPI job) [lo-a2-028:23087] Local abort before MPI_INIT completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed! -------------------------------------------------------------------------- It looks like orte_init failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during orte_init; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): orte_ess_base_open failed --> Returned value Not found (-13) instead of ORTE_SUCCESS -------------------------------------------------------------------------- -------------------------------------------------------------------------- It looks like MPI_INIT failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during MPI_INIT; some of which are due to configuration or environment problems. 
This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): ompi_mpi_init: ompi_rte_init failed --> Returned "Not found" (-13) instead of "Success" (0) -------------------------------------------------------------------------- -------------------------------------------------------------------------- It looks like MPI_INIT failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during MPI_INIT; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): ompi_mpi_init: ompi_rte_init failed --> Returned "Not found" (-13) instead of "Success" (0) -------------------------------------------------------------------------- *** An error occurred in MPI_Init *** on a NULL communicator *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort, *** and potentially your MPI job) [lo-a2-028:23086] Local abort before MPI_INIT completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed! -------------------------------------------------------------------------- A requested component was not found, or was unable to be opened. This means that this component is either not installed or is unable to be used on your system (e.g., sometimes this means that shared libraries that the component requires are unable to be found/loaded). Note that Open MPI stopped checking at the first component that it did not find. Host: lo-a2-028 Framework: ess Component: pmi -------------------------------------------------------------------------- [lo-a2-028:23088] [[INVALID],INVALID] ORTE_ERROR_LOG: Not found in file runtime/orte_init.c at line 129 -------------------------------------------------------------------------- It looks like orte_init failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during orte_init; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): orte_ess_base_open failed --> Returned value Not found (-13) instead of ORTE_SUCCESS -------------------------------------------------------------------------- -------------------------------------------------------------------------- It looks like MPI_INIT failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during MPI_INIT; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): ompi_mpi_init: ompi_rte_init failed --> Returned "Not found" (-13) instead of "Success" (0) -------------------------------------------------------------------------- *** An error occurred in MPI_Init *** on a NULL communicator *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort, *** and potentially your MPI job) [lo-a2-028:23088] Local abort before MPI_INIT completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed! 
------------------------------------------------------- Primary job terminated normally, but 1 process returned a non-zero exit code.. Per user-direction, the job has been aborted. ------------------------------------------------------- -------------------------------------------------------------------------- mpirun detected that one or more processes exited with non-zero status, thus causing the job to be terminated. The first process to do so was: Process name: [[5898,1],3] Exit code: 1 -------------------------------------------------------------------------- ## CONTAINER 2.1.0 [balazsl@lo-a2-028]$ mpirun -n 4 singularity exec mpi_hello_world-210.img /mpi_hello_world/mpi_hello_world /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) -------------------------------------------------------------------------- It looks like MPI_INIT failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during MPI_INIT; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): ompi_mpi_init: ompi_rte_init failed --> Returned "(null)" (-43) instead of "Success" (0) -------------------------------------------------------------------------- -------------------------------------------------------------------------- It looks like MPI_INIT failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during MPI_INIT; some of which are due to configuration or environment problems. This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer): ompi_mpi_init: ompi_rte_init failed --> Returned "(null)" (-43) instead of "Success" (0) -------------------------------------------------------------------------- *** An error occurred in MPI_Init *** on a NULL communicator *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort, *** and potentially your MPI job) [lo-a2-028:23176] Local abort before MPI_INIT completed completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed! *** An error occurred in MPI_Init *** on a NULL communicator *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort, *** and potentially your MPI job) [lo-a2-028:23175] Local abort before MPI_INIT completed completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed! -------------------------------------------------------------------------- It looks like MPI_INIT failed for some reason; your parallel process is likely to abort. There are many reasons that a parallel process can fail during MPI_INIT; some of which are due to configuration or environment problems. 
This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer):
ompi_mpi_init: ompi_rte_init failed
--> Returned "(null)" (-43) instead of "Success" (0)
--------------------------------------------------------------------------
*** An error occurred in MPI_Init
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
*** and potentially your MPI job)
[lo-a2-028:23178] Local abort before MPI_INIT completed completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!
-------------------------------------------------------
Primary job terminated normally, but 1 process returned a non-zero exit code..
Per user-direction, the job has been aborted.
-------------------------------------------------------
--------------------------------------------------------------------------
It looks like MPI_INIT failed for some reason; your parallel process is likely to abort.
There are many reasons that a parallel process can fail during MPI_INIT; some of which are due to configuration or environment problems.
This failure appears to be an internal failure; here's some additional information (which may only be relevant to an Open MPI developer):
ompi_mpi_init: ompi_rte_init failed
--> Returned "(null)" (-43) instead of "Success" (0)
--------------------------------------------------------------------------
*** An error occurred in MPI_Init
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
*** and potentially your MPI job)
[lo-a2-028:23177] Local abort before MPI_INIT completed completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing the job to be terminated.
The first process to do so was:
Process name: [[6063,1],0]
Exit code: 1
--------------------------------------------------------------------------
## LSF SUBMISSION
[balazsl@lo-login-01]$ bsub < LSF_submission_scripts/mpi_singul_v1103_single_node_n4.sh
MPI job.
Job <9993> is submitted to queue .
[balazsl@lo-login-01]$ bsub < LSF_submission_scripts/mpi_singul_v210_single_node_n4.sh
MPI job.
Job <9994> is submitted to queue .
[balazsl@lo-login-01]$ bsub < LSF_submission_scripts/mpi_no_container_single_node_n4.sh
MPI job.
Job <9995> is submitted to queue .
[balazsl@lo-login-01]$ bjobs -w
JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME SUBMIT_TIME
9993 balazsl PEND normal.4h lo-login-01 - mpi_singul_v1103_single_node_n4 Aug 22 17:10
9994 balazsl PEND normal.4h lo-login-01 - mpi_singul_v210_single_node_n4 Aug 22 17:10
9995 balazsl PEND normal.4h lo-login-01 - mpi_no_container_single_node_n4 Aug 22 17:10
-## SCRIPT SUBMISSION
+## SCRIPT SUBMISSION TEST 01
[balazsl@lo-login-01]$ ./launch_LSF_jobs.sh
Submitting LSF_submission_scripts/mpi_no_container_single_node_n4_no_cuda.sh ...
MPI job.
Job <9996> is submitted to queue .
Submitting LSF_submission_scripts/mpi_no_container_single_node_n4.sh ...
MPI job.
Job <9997> is submitted to queue .
Submitting LSF_submission_scripts/mpi_singul_v1103_single_node_n4_no_cuda.sh ...
MPI job.
Job <9998> is submitted to queue .
Submitting LSF_submission_scripts/mpi_singul_v1103_single_node_n4.sh ...
MPI job.
Job <9999> is submitted to queue .
Submitting LSF_submission_scripts/mpi_singul_v210_single_node_n4_no_cuda.sh ...
MPI job.
Job <10000> is submitted to queue .
Submitting LSF_submission_scripts/mpi_singul_v210_single_node_n4.sh ...
MPI job.
Job <10001> is submitted to queue .
JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME SUBMIT_TIME
9996 balazsl PEND normal.4h lo-login-01 - mpi_no_container_single_node_n4_no_cuda Aug 22 17:22
9997 balazsl PEND normal.4h lo-login-01 - mpi_no_container_single_node_n4 Aug 22 17:22
9998 balazsl PEND normal.4h lo-login-01 - mpi_singul_v1103_single_node_n4_no_cuda Aug 22 17:22
9999 balazsl PEND normal.4h lo-login-01 - mpi_singul_v1103_single_node_n4 Aug 22 17:22
10000 balazsl PEND normal.4h lo-login-01 - mpi_singul_v210_single_node_n4_no_cuda Aug 22 17:22
10001 balazsl PEND normal.4h lo-login-01 - mpi_singul_v210_single_node_n4 Aug 22 17:22
+## SAVING LOGS
+[balazsl@lo-login-01]$ V=01 && mkdir test_$V && mv *.out test_$V
+
+## SCRIPT SUBMISSION TEST 02
+[balazsl@lo-login-01]$ ./launch_LSF_jobs.sh
+Submitting LSF_submission_scripts/mpi_no_container_single_node_n4_no_cuda.sh ...
+MPI job.
+Job <10002> is submitted to queue .
+Submitting LSF_submission_scripts/mpi_no_container_single_node_n4.sh ...
+MPI job.
+Job <10003> is submitted to queue .
+Submitting LSF_submission_scripts/mpi_singul_v1103_single_node_n4_no_cuda.sh ...
+MPI job.
+Job <10004> is submitted to queue .
+Submitting LSF_submission_scripts/mpi_singul_v1103_single_node_n4.sh ...
+MPI job.
+Job <10005> is submitted to queue .
+Submitting LSF_submission_scripts/mpi_singul_v210_single_node_n4_no_cuda.sh ...
+MPI job.
+Job <10006> is submitted to queue .
+Submitting LSF_submission_scripts/mpi_singul_v210_single_node_n4.sh ...
+MPI job.
+Job <10007> is submitted to queue .
+JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME SUBMIT_TIME
+10002 balazsl PEND normal.4h lo-login-01 - mpi_no_container_single_node_n4_no_cuda Aug 22 17:27
+10003 balazsl PEND normal.4h lo-login-01 - mpi_no_container_single_node_n4 Aug 22 17:27
+10004 balazsl PEND normal.4h lo-login-01 - mpi_singul_v1103_single_node_n4_no_cuda Aug 22 17:27
+10005 balazsl PEND normal.4h lo-login-01 - mpi_singul_v1103_single_node_n4 Aug 22 17:27
+10006 balazsl PEND normal.4h lo-login-01 - mpi_singul_v210_single_node_n4_no_cuda Aug 22 17:27
+10007 balazsl PEND normal.4h lo-login-01 - mpi_singul_v210_single_node_n4 Aug 22 17:27
+
+## SAVING LOGS
+V=02 && mkdir test_$V && mv *.out test_$V
+
+## SCRIPT SUBMISSION TEST 03
\ No newline at end of file