diff --git a/perf_tests/common/include/ze_app.hpp b/perf_tests/common/include/ze_app.hpp index 0412f783..3eb53793 100644 --- a/perf_tests/common/include/ze_app.hpp +++ b/perf_tests/common/include/ze_app.hpp @@ -50,6 +50,10 @@ class ZeApp { void commandListCreate(uint32_t device_index, uint32_t command_queue_group_ordinal, ze_command_list_handle_t *phCommandList); + /* Creates an immediate command list for device_index on the given queue group ordinal and queue index; the handle is returned through phCommandList. */ void immediateCommandListCreate(uint32_t device_index, + uint32_t command_queue_group_ordinal, + uint32_t command_queue_index, + ze_command_list_handle_t *phCommandList); void commandListDestroy(ze_command_list_handle_t phCommandList); void commandListClose(ze_command_list_handle_t phCommandList); void commandListReset(ze_command_list_handle_t phCommandList); diff --git a/perf_tests/common/src/ze_app.cpp b/perf_tests/common/src/ze_app.cpp index 2a14e0e1..ff4f24ed 100644 --- a/perf_tests/common/src/ze_app.cpp +++ b/perf_tests/common/src/ze_app.cpp @@ -234,6 +234,19 @@ void ZeApp::commandListCreate(uint32_t device_index, phCommandList)); } +void ZeApp::immediateCommandListCreate( /* Fills a ze_command_queue_desc_t with the requested group ordinal and queue index and hands it to zeCommandListCreateImmediate for _devices[device_index]; the call's result is passed to SUCCESS_OR_TERMINATE. */ + uint32_t device_index, uint32_t command_queue_group_ordinal, + uint32_t command_queue_index, ze_command_list_handle_t *phCommandList) { + ze_command_queue_desc_t command_queue_description{}; + command_queue_description.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC; + /* no extension structure is chained to the descriptor */ command_queue_description.pNext = nullptr; + command_queue_description.ordinal = command_queue_group_ordinal; + command_queue_description.index = command_queue_index; + SUCCESS_OR_TERMINATE( + zeCommandListCreateImmediate(context, _devices[device_index], + &command_queue_description, phCommandList)); +} + void ZeApp::commandListDestroy(ze_command_list_handle_t command_list) { SUCCESS_OR_TERMINATE(zeCommandListDestroy(command_list)); } diff --git a/perf_tests/ze_peer/include/ze_peer.h b/perf_tests/ze_peer/include/ze_peer.h index 84428444..1b8563cf 100644 --- a/perf_tests/ze_peer/include/ze_peer.h +++ b/perf_tests/ze_peer/include/ze_peer.h @@ -126,6 +126,7 @@ static
const char *usage_str = "divide " "buffers across available" "\n engines specified with option -u." + "\n" "\n -x for unidirectional parallel tests, select " "where to place the queue" "\n src use queue in source" @@ -136,6 +137,9 @@ static const char *usage_str = "\n with each device being managed by a " "separate process." "\n" + "\n --regular_cmdlist use regular command list instead of " + "immediate" + "\n" "\n --version display version" "\n -h, --help display help message" "\n"; @@ -155,10 +159,18 @@ class ZePeer { uint32_t local_device_id, size_t buffer_size); + /* NOTE(review): the ..._immediate members added to this class are presumably the immediate command list variants of the members they sit next to, chosen when use_immediate_cmdlist is true - confirm against their definitions. */ void perform_parallel_copy_to_single_target_immediate( + peer_test_t test_type, peer_transfer_t transfer_type, + uint32_t remote_device_id, uint32_t local_device_id, size_t buffer_size); + void perform_bidirectional_parallel_copy_to_single_target( peer_test_t test_type, peer_transfer_t transfer_type, uint32_t remote_device_id, uint32_t local_device_id, size_t buffer_size); + void perform_bidirectional_parallel_copy_to_single_target_immediate( + peer_test_t test_type, peer_transfer_t transfer_type, + uint32_t remote_device_id, uint32_t local_device_id, size_t buffer_size); + void bandwidth_latency_parallel_to_single_target( peer_test_t test_type, peer_transfer_t transfer_type, size_t number_buffer_elements, uint32_t remote_device_id, @@ -170,22 +182,44 @@ class ZePeer { std::vector &local_device_ids, size_t buffer_size, bool divide_buffers); + void perform_parallel_copy_to_multiple_targets_immediate( + peer_test_t test_type, peer_transfer_t transfer_type, + std::vector &remote_device_ids, + std::vector &local_device_ids, size_t buffer_size, + bool divide_buffers); + void perform_bidirectional_parallel_copy_to_multiple_targets( peer_test_t test_type, peer_transfer_t transfer_type, std::vector &remote_device_ids, std::vector &local_device_ids, size_t buffer_size, bool divide_buffers); + void perform_bidirectional_parallel_copy_to_multiple_targets_immediate( + peer_test_t test_type, peer_transfer_t
transfer_type, + std::vector &remote_device_ids, + std::vector &local_device_ids, size_t buffer_size, + bool divide_buffers); + void perform_parallel_copy_to_pair_targets( peer_test_t test_type, peer_transfer_t transfer_type, std::vector> &pair_device_ids, size_t buffer_size, bool divide_buffers); + void perform_parallel_copy_to_pair_targets_immediate( + peer_test_t test_type, peer_transfer_t transfer_type, + std::vector> &pair_device_ids, + size_t buffer_size, bool divide_buffers); + void perform_bidirectional_parallel_copy_to_pair_targets( peer_test_t test_type, peer_transfer_t transfer_type, std::vector> &pair_device_ids, size_t buffer_size, bool divide_buffers); + void perform_bidirectional_parallel_copy_to_pair_targets_immediate( + peer_test_t test_type, peer_transfer_t transfer_type, + std::vector> &pair_device_ids, + size_t buffer_size, bool divide_buffers); + void bandwidth_latency_parallel_to_pair_targets( peer_test_t test_type, peer_transfer_t transfer_type, size_t number_buffer_elements, @@ -214,17 +248,30 @@ class ZePeer { ze_command_queue_handle_t command_queue, void *dst_buffer, void *src_buffer, size_t buffer_size); + /* Takes the same buffers as perform_copy but only a command list, no command queue. */ void perform_copy_immediate(peer_test_t test_type, + ze_command_list_handle_t command_list, + void *dst_buffer, void *src_buffer, + size_t buffer_size); + void bidirectional_perform_copy(uint32_t dst_device_id, uint32_t src_device_id, uint32_t queue_index, peer_test_t test_type, peer_transfer_t transfer_type, size_t buffer_size); + void bidirectional_perform_copy_immediate( + uint32_t remote_device_id, uint32_t local_device_id, uint32_t queue_index, + peer_test_t test_type, peer_transfer_t transfer_type, size_t buffer_size); + void initialize_src_buffer(ze_command_list_handle_t command_list, ze_command_queue_handle_t command_queue, void *local_buffer, char *host_buffer, size_t buffer_size); + /* Signature of initialize_src_buffer without the command queue parameter. */ void initialize_src_buffer_immediate(ze_command_list_handle_t command_list, + void *src_buffer, char *host_buffer, + size_t buffer_size); + void
initialize_buffers(ze_command_list_handle_t command_list, ze_command_queue_handle_t command_queue, void *src_buffer, char *host_buffer, @@ -239,6 +286,10 @@ class ZePeer { char *validate_buffer, void *dst_buffer, char *host_buffer, size_t buffer_size); + /* Signature of validate_buffer without the command queue parameter. */ void validate_buffer_immediate(ze_command_list_handle_t command_list, + char *validate_buffer, void *dst_buffer, + char *host_buffer, size_t buffer_size); + void set_up(size_t number_buffer_elements, std::vector &remote_device_ids, std::vector &local_device_ids, size_t &buffer_size); @@ -297,6 +348,7 @@ class ZePeer { static bool parallel_copy_to_multiple_targets; static bool parallel_copy_to_pair_targets; static bool parallel_divide_buffers; + /* True selects the immediate command list code paths; initialised to true and cleared by the --regular_cmdlist option. */ static bool use_immediate_cmdlist; static uint32_t number_iterations; uint32_t warm_up_iterations = number_iterations / 5; diff --git a/perf_tests/ze_peer/src/ze_peer.cpp b/perf_tests/ze_peer/src/ze_peer.cpp index 8156cab1..5f6fb5b9 100644 --- a/perf_tests/ze_peer/src/ze_peer.cpp +++ b/perf_tests/ze_peer/src/ze_peer.cpp @@ -18,6 +18,7 @@ bool ZePeer::parallel_copy_to_single_target = false; bool ZePeer::parallel_copy_to_multiple_targets = false; bool ZePeer::parallel_copy_to_pair_targets = false; bool ZePeer::parallel_divide_buffers = false; +bool ZePeer::use_immediate_cmdlist = true; /* immediate command lists are the default */ uint32_t ZePeer::number_iterations = 50; const size_t max_elems = 268435456; /* 256 MB */ @@ -457,6 +458,8 @@ int main(int argc, char **argv) { i++; } else if (strcmp(argv[i], "-v") == 0) { ZePeer::validate_results = true; + } else if (strcmp(argv[i], "--regular_cmdlist") == 0) { + /* select the regular command list code paths */ ZePeer::use_immediate_cmdlist = false; } else { std::cout << usage_str; exit(-1); @@ -711,18 +714,25 @@ ZePeer::ZePeer(std::vector &remote_device_ids, uint32_t engineIndex = 0; for (uint32_t g = 0; g < numQueueGroups; g++) { for (uint32_t q = 0; q < queueProperties[g].numQueues; q++) { - ze_command_queue_handle_t command_queue; - benchmark->commandQueueCreate(d, g, q, &command_queue); + /* (command queue, command list) entry pushed to engines below. NOTE(review): the template arguments of std::pair, and of other templates and casts in this patch text, appear to have been stripped - confirm against the original patch. */ std::pair + enginePair; + if
(ZePeer::use_immediate_cmdlist) { + ze_command_list_handle_t command_list; + benchmark->immediateCommandListCreate(d, g, q, &command_list); - ze_command_list_handle_t command_list; - benchmark->commandListCreate(d, g, &command_list); + /* immediate mode creates no command queue, so the queue slot is left null */ enginePair = std::make_pair(nullptr, command_list); + } else { + ze_command_queue_handle_t command_queue; + benchmark->commandQueueCreate(d, g, q, &command_queue); - auto enginePair = std::make_pair(command_queue, command_list); - ze_peer_devices[d].engines.push_back(enginePair); + ze_command_list_handle_t command_list; + benchmark->commandListCreate(d, g, &command_list); + + enginePair = std::make_pair(command_queue, command_list); + } - // use compute engines by default. Select the indexes from device 0 - if (option_u_empty && (queueProperties[g].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) { + ze_peer_devices[d].engines.push_back(enginePair); + /* the compute-only flag test was removed: every engine index is recorded when option_u_empty is set */ if (option_u_empty) { this->queues.push_back(engineIndex); } @@ -745,7 +755,9 @@ ZePeer::ZePeer(std::vector &remote_device_ids, ZePeer::~ZePeer() { for (auto &device : ze_peer_devices) { for (auto enginePair : device.engines) { - benchmark->commandQueueDestroy(enginePair.first); + /* the queue slot is null for immediate command lists, so destroy a queue only when one exists */ if (enginePair.first) { + benchmark->commandQueueDestroy(enginePair.first); + } benchmark->commandListDestroy(enginePair.second); } } diff --git a/perf_tests/ze_peer/src/ze_peer_bidirectional.cpp b/perf_tests/ze_peer/src/ze_peer_bidirectional.cpp index 9bb113bc..cb5b8fd8 100644 --- a/perf_tests/ze_peer/src/ze_peer_bidirectional.cpp +++ b/perf_tests/ze_peer/src/ze_peer_bidirectional.cpp @@ -80,6 +80,75 @@ void ZePeer::bidirectional_perform_copy( SUCCESS_OR_TERMINATE(zeCommandListReset(remote_command_list)); } +void ZePeer::bidirectional_perform_copy_immediate( /* Measures a bidirectional copy between local_device_id and remote_device_id on the command lists stored in engines[queue_index].second of each device. Every iteration appends one copy per device that waits on event, releases both with zeEventHostSignal, waits for both lists with zeCommandListHostSynchronize and then resets the event. For PEER_WRITE each device copies its own src buffer into the peer's dst buffer; otherwise each device copies the peer's src buffer into its own dst buffer. */ + uint32_t remote_device_id, uint32_t local_device_id, uint32_t queue_index, + peer_test_t test_type, peer_transfer_t transfer_type, size_t buffer_size) { + ze_command_list_handle_t local_command_list = +
ze_peer_devices[local_device_id].engines[queue_index].second; + ze_command_list_handle_t remote_command_list = + ze_peer_devices[remote_device_id].engines[queue_index].second; + + Timer timer; + + /* Warm up */ + for (uint32_t i = 0U; i < warm_up_iterations; i++) { + if (transfer_type == PEER_WRITE) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + local_command_list, ze_dst_buffers[remote_device_id], + ze_src_buffers[local_device_id], buffer_size, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + remote_command_list, ze_dst_buffers[local_device_id], + ze_src_buffers[remote_device_id], buffer_size, nullptr, 1, &event)); + } else { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + local_command_list, ze_dst_buffers[local_device_id], + ze_src_buffers[remote_device_id], buffer_size, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + remote_command_list, ze_dst_buffers[remote_device_id], + ze_src_buffers[local_device_id], buffer_size, nullptr, 1, &event)); + } + /* both appended copies wait on event; the host signal releases them together */ SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + local_command_list, std::numeric_limits::max())); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + remote_command_list, std::numeric_limits::max())); + /* re-arm the event before the next iteration appends new copies */ SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + } + + do { + long double time_usec = 0; + for (uint32_t i = 0U; i < number_iterations; i++) { + if (transfer_type == PEER_WRITE) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + local_command_list, ze_dst_buffers[remote_device_id], + ze_src_buffers[local_device_id], buffer_size, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + remote_command_list, ze_dst_buffers[local_device_id], + ze_src_buffers[remote_device_id], buffer_size, nullptr, 1, &event)); + } else { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + local_command_list, ze_dst_buffers[local_device_id], + ze_src_buffers[remote_device_id],
buffer_size, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + remote_command_list, ze_dst_buffers[remote_device_id], + ze_src_buffers[local_device_id], buffer_size, nullptr, 1, &event)); + } + + /* the timed window covers the host signal and both synchronizes; the appends above are outside it */ timer.start(); + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + local_command_list, std::numeric_limits::max())); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + remote_command_list, std::numeric_limits::max())); + timer.end(); + time_usec += timer.period_minus_overhead(); + + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + } + print_results(true, test_type, buffer_size, time_usec); + } while (run_continuously); +} + void ZePeer::bidirectional_bandwidth_latency(peer_test_t test_type, peer_transfer_t transfer_type, size_t number_buffer_elements, @@ -98,23 +167,41 @@ void ZePeer::bidirectional_bandwidth_latency(peer_test_t test_type, initialize_buffers(remote_device_ids, local_device_ids, ze_host_buffer, buffer_size); - bidirectional_perform_copy(remote_device_id, local_device_id, queue_index, - test_type, transfer_type, buffer_size); + if (ZePeer::use_immediate_cmdlist) { + bidirectional_perform_copy_immediate(remote_device_id, local_device_id, + queue_index, test_type, transfer_type, + buffer_size); + } else { + bidirectional_perform_copy(remote_device_id, local_device_id, queue_index, + test_type, transfer_type, buffer_size); + } if (validate_results) { - validate_buffer(ze_peer_devices[remote_device_id].engines[0].second, - ze_peer_devices[remote_device_id].engines[0].first, - ze_host_validate_buffer, ze_dst_buffers[remote_device_id], - ze_host_buffer, buffer_size); - + /* validation reads back through engine 0 of the device */ if (ZePeer::use_immediate_cmdlist) { + validate_buffer_immediate( + ze_peer_devices[remote_device_id].engines[0].second, + ze_host_validate_buffer, ze_dst_buffers[remote_device_id], + ze_host_buffer, buffer_size); + } else { + validate_buffer(ze_peer_devices[remote_device_id].engines[0].second, +
ze_peer_devices[remote_device_id].engines[0].first, + ze_host_validate_buffer, ze_dst_buffers[remote_device_id], + ze_host_buffer, buffer_size); + } for (size_t k = 0; k < buffer_size; k++) { ze_host_validate_buffer[k] = 5; } - - validate_buffer(ze_peer_devices[local_device_id].engines[0].second, - ze_peer_devices[local_device_id].engines[0].first, - ze_host_validate_buffer, ze_dst_buffers[local_device_id], - ze_host_buffer, buffer_size); + if (ZePeer::use_immediate_cmdlist) { + validate_buffer_immediate( + ze_peer_devices[local_device_id].engines[0].second, + ze_host_validate_buffer, ze_dst_buffers[local_device_id], + ze_host_buffer, buffer_size); + } else { + validate_buffer(ze_peer_devices[local_device_id].engines[0].second, + ze_peer_devices[local_device_id].engines[0].first, + ze_host_validate_buffer, ze_dst_buffers[local_device_id], + ze_host_buffer, buffer_size); + } } tear_down(remote_device_ids, local_device_ids); diff --git a/perf_tests/ze_peer/src/ze_peer_common.cpp b/perf_tests/ze_peer/src/ze_peer_common.cpp index 40584cbe..c1b27624 100644 --- a/perf_tests/ze_peer/src/ze_peer_common.cpp +++ b/perf_tests/ze_peer/src/ze_peer_common.cpp @@ -195,6 +195,15 @@ void ZePeer::initialize_src_buffer(ze_command_list_handle_t command_list, SUCCESS_OR_TERMINATE(zeCommandListReset(command_list)); } +void ZePeer::initialize_src_buffer_immediate( /* Appends a copy of buffer_size bytes from host_buffer to src_buffer on command_list with no signal or wait events, then calls zeCommandListHostSynchronize on that list with the maximum timeout value. */ + ze_command_list_handle_t command_list, void *src_buffer, char *host_buffer, + size_t buffer_size) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + command_list, src_buffer, host_buffer, buffer_size, nullptr, 0, nullptr)); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + command_list, std::numeric_limits::max())); +} + void ZePeer::initialize_buffers(ze_command_list_handle_t command_list, ze_command_queue_handle_t command_queue, void *src_buffer, char *host_buffer, @@ -208,8 +217,13 @@ void ZePeer::initialize_buffers(ze_command_list_handle_t command_list, } if (src_buffer) { - initialize_src_buffer(command_list,
command_queue, src_buffer, host_buffer, - buffer_size); + /* command_queue is not passed on the immediate path */ if (use_immediate_cmdlist) { + initialize_src_buffer_immediate(command_list, src_buffer, host_buffer, + buffer_size); + } else { + initialize_src_buffer(command_list, command_queue, src_buffer, + host_buffer, buffer_size); + } } } @@ -229,8 +243,13 @@ void ZePeer::initialize_buffers(std::vector &remote_device_ids, ze_command_queue_handle_t command_queue = ze_peer_devices[remote_device_id].engines[0].first; void *src_buffer = ze_src_buffers[remote_device_id]; - initialize_src_buffer(command_list, command_queue, src_buffer, host_buffer, - buffer_size); + if (ZePeer::use_immediate_cmdlist) { + initialize_src_buffer_immediate(command_list, src_buffer, host_buffer, + buffer_size); + } else { + initialize_src_buffer(command_list, command_queue, src_buffer, + host_buffer, buffer_size); + } } for (auto local_device_id : local_device_ids) { @@ -239,8 +258,13 @@ void ZePeer::initialize_buffers(std::vector &remote_device_ids, ze_command_queue_handle_t command_queue = ze_peer_devices[local_device_id].engines[0].first; void *src_buffer = ze_src_buffers[local_device_id]; - initialize_src_buffer(command_list, command_queue, src_buffer, host_buffer, - buffer_size); + if (ZePeer::use_immediate_cmdlist) { + initialize_src_buffer_immediate(command_list, src_buffer, host_buffer, + buffer_size); + } else { + initialize_src_buffer(command_list, command_queue, src_buffer, + host_buffer, buffer_size); + } } } @@ -268,3 +292,23 @@ void ZePeer::validate_buffer(ze_command_list_handle_t command_list, } } } + +void ZePeer::validate_buffer_immediate(ze_command_list_handle_t command_list, + char *validate_buffer, void *dst_buffer, + char *host_buffer, size_t buffer_size) { /* Copies buffer_size bytes from dst_buffer into validate_buffer on command_list, synchronizes the list, then compares validate_buffer with host_buffer byte by byte. */ + SUCCESS_OR_TERMINATE( + zeCommandListAppendMemoryCopy(command_list, validate_buffer, dst_buffer, + buffer_size, nullptr, 0, nullptr)); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + command_list, std::numeric_limits::max())); + + for (size_t i = 0; i <
buffer_size; i++) { + if (validate_buffer[i] != host_buffer[i]) { + std::cout << "Error at " << i << ": validate_buffer " + << static_cast(validate_buffer[i]) + << " != host_buffer " << static_cast(host_buffer[i]) + << "\n"; + /* stop after reporting the first mismatch */ break; + } + } +} diff --git a/perf_tests/ze_peer/src/ze_peer_ipc.cpp b/perf_tests/ze_peer/src/ze_peer_ipc.cpp index 80cfc32f..1872ca43 100644 --- a/perf_tests/ze_peer/src/ze_peer_ipc.cpp +++ b/perf_tests/ze_peer/src/ze_peer_ipc.cpp @@ -142,19 +142,36 @@ void ZePeer::bandwidth_latency_ipc(peer_test_t test_type, if (is_server == false) { if (transfer_type == PEER_READ) { - perform_copy(test_type, command_list, command_queue, - ze_buffers[local_device_id], ze_buffers[remote_device_id], - buffer_size); - + if (ZePeer::use_immediate_cmdlist) { + perform_copy_immediate(test_type, command_list, + ze_buffers[local_device_id], + ze_buffers[remote_device_id], buffer_size); + } else { + perform_copy(test_type, command_list, command_queue, + ze_buffers[local_device_id], ze_buffers[remote_device_id], + buffer_size); + } if (validate_results) { - validate_buffer(command_list, command_queue, ze_host_validate_buffer, - ze_buffers[local_device_id], ze_host_buffer, - buffer_size); + if (ZePeer::use_immediate_cmdlist) { + validate_buffer_immediate(command_list, ze_host_validate_buffer, + ze_buffers[local_device_id], ze_host_buffer, + buffer_size); + } else { + validate_buffer(command_list, command_queue, ze_host_validate_buffer, + ze_buffers[local_device_id], ze_host_buffer, + buffer_size); + } } } else { - perform_copy(test_type, command_list, command_queue, - ze_buffers[remote_device_id], ze_buffers[local_device_id], - buffer_size); + if (ZePeer::use_immediate_cmdlist) { + perform_copy_immediate(test_type, command_list, + ze_buffers[remote_device_id], + ze_buffers[local_device_id], buffer_size); + } else { + perform_copy(test_type, command_list, command_queue, + ze_buffers[remote_device_id], ze_buffers[local_device_id], + buffer_size); + } } } else { @@
-168,9 +185,15 @@ void ZePeer::bandwidth_latency_ipc(peer_test_t test_type, } if (transfer_type == PEER_WRITE) { if (validate_results) { - validate_buffer(command_list, command_queue, ze_host_validate_buffer, - ze_buffers[local_device_id], ze_host_buffer, - buffer_size); + if (ZePeer::use_immediate_cmdlist) { + validate_buffer_immediate(command_list, ze_host_validate_buffer, + ze_buffers[local_device_id], ze_host_buffer, + buffer_size); + } else { + validate_buffer(command_list, command_queue, ze_host_validate_buffer, + ze_buffers[local_device_id], ze_host_buffer, + buffer_size); + } } } } diff --git a/perf_tests/ze_peer/src/ze_peer_parallel_multiple_targets.cpp b/perf_tests/ze_peer/src/ze_peer_parallel_multiple_targets.cpp index 14bdca84..cbfe1008 100644 --- a/perf_tests/ze_peer/src/ze_peer_parallel_multiple_targets.cpp +++ b/perf_tests/ze_peer/src/ze_peer_parallel_multiple_targets.cpp @@ -370,6 +370,388 @@ void ZePeer::perform_bidirectional_parallel_copy_to_multiple_targets( } } +void ZePeer::perform_bidirectional_parallel_copy_to_multiple_targets_immediate( /* Bidirectional parallel copy between every (local, remote) device pair whose ids differ, issued on the command lists stored in engines[...].second. With divide_buffers each direction is split into queues.size() chunks of buffer_size / queues.size() bytes, one chunk per entry of queues; otherwise each pair takes one entry of queues in round-robin order. Every appended copy waits on event and is released by zeEventHostSignal; completion is awaited with zeCommandListHostSynchronize. */ + peer_test_t test_type, peer_transfer_t transfer_type, + std::vector &remote_device_ids, + std::vector &local_device_ids, size_t buffer_size, + bool divide_buffers) { + + size_t total_buffer_size = 0; + + size_t num_engines = queues.size(); + /* NOTE(review): divides by queues.size(); confirm queues can never be empty here */ size_t chunk = buffer_size / num_engines; + + size_t queue_index_iter = 0; + /* NOTE(review): this first pass appends event-gated copies to the command lists before the warm-up and timed loops append their own; it looks carried over from the regular command list version - confirm it is intended. It also accumulates total_buffer_size. */ for (size_t local_device_id_iter = 0; + local_device_id_iter < local_device_ids.size(); local_device_id_iter++) { + for (size_t remote_device_id_iter = 0; + remote_device_id_iter < remote_device_ids.size(); + remote_device_id_iter++) { + + auto local_device_id = local_device_ids[local_device_id_iter]; + auto remote_device_id = remote_device_ids[remote_device_id_iter]; + + if (local_device_id == remote_device_id) { + continue; + } + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + if (transfer_type == PEER_WRITE) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( +
ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast( + ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast(reinterpret_cast( + ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast( + ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast(reinterpret_cast( + ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } else { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast( + ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast(reinterpret_cast( + ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast( + ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast(reinterpret_cast( + ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } + } + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + + if (transfer_type == PEER_WRITE) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queue_index].second, + ze_dst_buffers[remote_device_id], ze_src_buffers[local_device_id], + buffer_size, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queue_index].second, + ze_dst_buffers[local_device_id], ze_src_buffers[remote_device_id], + buffer_size, nullptr, 1, &event)); + } else { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queue_index].second, +
ze_dst_buffers[local_device_id], ze_src_buffers[remote_device_id], + buffer_size, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queue_index].second, + ze_dst_buffers[remote_device_id], ze_src_buffers[local_device_id], + buffer_size, nullptr, 1, &event)); + } + } + + total_buffer_size += buffer_size; + } + } + + /* Warm up */ + for (uint32_t i = 0U; i < warm_up_iterations; i++) { + queue_index_iter = 0; + for (size_t local_device_id_iter = 0; + local_device_id_iter < local_device_ids.size(); + local_device_id_iter++) { + for (size_t remote_device_id_iter = 0; + remote_device_id_iter < remote_device_ids.size(); + remote_device_id_iter++) { + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + auto local_device_id = local_device_ids[local_device_id_iter]; + auto remote_device_id = remote_device_ids[remote_device_id_iter]; + + if (local_device_id == remote_device_id) { + continue; + } + + /* NOTE(review): the warm-up never checks transfer_type; it always appends the copies of the non-PEER_WRITE branch (peer src into own dst) - confirm this is intended */ if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast( + ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast(reinterpret_cast( + ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast( + ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast(reinterpret_cast( + ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queue_index].second, + ze_dst_buffers[local_device_id], ze_src_buffers[remote_device_id], + buffer_size, nullptr, 1,
&event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queue_index].second, + ze_dst_buffers[remote_device_id], ze_src_buffers[local_device_id], + buffer_size, nullptr, 1, &event)); + } + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[local_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + } + } else { + /* NOTE(review): queue_index_iter was already advanced when the copies were appended above, so with more than one entry in queues this waits on a different engine than the one just used - confirm */ uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[local_device_id].engines[queue_index].second, + std::numeric_limits::max())); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[remote_device_id].engines[queue_index].second, + std::numeric_limits::max())); + } + } + } + } + + Timer timer; + std::vector>> timers( + ze_peer_devices.size()); + for (size_t i = 0U; i < timers.size(); i++) { + timers[i].resize(ze_peer_devices.size()); + } + + std::vector> times(ze_peer_devices.size()); + for (size_t i = 0U; i < times.size(); i++) { + times[i].resize(ze_peer_devices.size()); + for (size_t j = 0U; j < times.size(); j++) { + times[i][j] = 0; + } + } + + do { + timer.start(); + for (uint32_t i = 0U; i < number_iterations; i++) { + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + queue_index_iter = 0; + for (size_t local_device_id_iter = 0; + local_device_id_iter < local_device_ids.size(); + local_device_id_iter++) { + for (size_t remote_device_id_iter = 0; + remote_device_id_iter < remote_device_ids.size(); + remote_device_id_iter++) { + + auto local_device_id = local_device_ids[local_device_id_iter]; + auto remote_device_id = remote_device_ids[remote_device_id_iter]; + + if (local_device_id ==
remote_device_id) { + continue; + } + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + if (transfer_type == PEER_WRITE) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast( + ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast( + ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast( + ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast( + ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } else { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast( + ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast( + ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast( + ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast( + ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } + } + } else { + uint32_t queue_index = queues[queue_index_iter++ % + static_cast(queues.size())]; + + if (transfer_type == PEER_WRITE) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queue_index].second, + ze_dst_buffers[remote_device_id], + ze_src_buffers[local_device_id], buffer_size, nullptr, 1, + &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queue_index].second, + ze_dst_buffers[local_device_id],
+ ze_src_buffers[remote_device_id], buffer_size, nullptr, 1, + &event)); + } else { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queue_index].second, + ze_dst_buffers[local_device_id], + ze_src_buffers[remote_device_id], buffer_size, nullptr, 1, + &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queue_index].second, + ze_dst_buffers[remote_device_id], + ze_src_buffers[local_device_id], buffer_size, nullptr, 1, + &event)); + } + } + } + } + + for (size_t local_device_id_iter = 0; + local_device_id_iter < local_device_ids.size(); + local_device_id_iter++) { + for (size_t remote_device_id_iter = 0; + remote_device_id_iter < remote_device_ids.size(); + remote_device_id_iter++) { + + auto local_device_id = local_device_ids[local_device_id_iter]; + auto remote_device_id = remote_device_ids[remote_device_id_iter]; + + if (local_device_id == remote_device_id) { + continue; + } + + timers[local_device_id][remote_device_id].start(); + } + } + + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + + queue_index_iter = 0; + for (size_t local_device_id_iter = 0; + local_device_id_iter < local_device_ids.size(); + local_device_id_iter++) { + for (size_t remote_device_id_iter = 0; + remote_device_id_iter < remote_device_ids.size(); + remote_device_id_iter++) { + + auto local_device_id = local_device_ids[local_device_id_iter]; + auto remote_device_id = remote_device_ids[remote_device_id_iter]; + + if (local_device_id == remote_device_id) { + continue; + } + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[local_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + } + } else { + uint32_t queue_index =
queues[queue_index_iter++ % + static_cast(queues.size())]; + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[local_device_id].engines[queue_index].second, + std::numeric_limits::max())); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[remote_device_id].engines[queue_index].second, + std::numeric_limits::max())); + } + /* NOTE(review): timer.end() runs once per device pair inside this loop; presumably the last call fixes the total passed to print_results below - confirm Timer semantics */ timer.end(); + + timers[local_device_id][remote_device_id].end(); + times[local_device_id][remote_device_id] += + timers[local_device_id][remote_device_id].period_minus_overhead(); + } + } + } + + queue_index_iter = 0; + for (size_t local_device_id_iter = 0; + local_device_id_iter < local_device_ids.size(); + local_device_id_iter++) { + for (size_t remote_device_id_iter = 0; + remote_device_id_iter < remote_device_ids.size(); + remote_device_id_iter++) { + auto local_device_id = local_device_ids[local_device_id_iter]; + auto remote_device_id = remote_device_ids[remote_device_id_iter]; + + if (local_device_id == remote_device_id) { + continue; + } + + if (divide_buffers) { + std::cout << "\tDevice " << local_device_id << " - Device " + << remote_device_id << " : "; + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + + std::cout << "\tDevice " << local_device_id << " - Device " + << remote_device_id << " using queue " << queue_index + << ": "; + } + + print_results(true, test_type, buffer_size, + times[local_device_id][remote_device_id]); + } + } + print_results(true, test_type, total_buffer_size, timer); + } while (run_continuously); +} + void ZePeer::perform_parallel_copy_to_multiple_targets( peer_test_t test_type, peer_transfer_t transfer_type, std::vector &remote_device_ids, @@ -715,6 +1097,271 @@ void ZePeer::perform_parallel_copy_to_multiple_targets( } } +void ZePeer::perform_parallel_copy_to_multiple_targets_immediate( + peer_test_t test_type, peer_transfer_t transfer_type, + std::vector &remote_device_ids, + std::vector &local_device_ids, size_t buffer_size,
+ bool divide_buffers) { + + size_t total_buffer_size = 0; + + size_t num_engines = queues.size(); + size_t chunk = buffer_size / num_engines; + + size_t queue_index_iter = 0; + for (size_t local_device_id_iter = 0; + local_device_id_iter < local_device_ids.size(); local_device_id_iter++) { + for (size_t remote_device_id_iter = 0; + remote_device_id_iter < remote_device_ids.size(); + remote_device_id_iter++) { + + auto local_device_id = local_device_ids[local_device_id_iter]; + auto remote_device_id = remote_device_ids[remote_device_id_iter]; + + if (local_device_id == remote_device_id) { + continue; + } + + total_buffer_size += buffer_size; + } + } + + /* Warm up */ + queue_index_iter = 0; + for (uint32_t i = 0U; i < warm_up_iterations; i++) { + for (size_t local_device_id_iter = 0; + local_device_id_iter < local_device_ids.size(); + local_device_id_iter++) { + for (size_t remote_device_id_iter = 0; + remote_device_id_iter < remote_device_ids.size(); + remote_device_id_iter++) { + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + auto local_device_id = local_device_ids[local_device_id_iter]; + auto remote_device_id = remote_device_ids[remote_device_id_iter]; + + if (local_device_id == remote_device_id) { + continue; + } + + void *dst_buffer = ze_dst_buffers[remote_device_id]; + void *src_buffer = ze_src_buffers[local_device_id]; + if (transfer_type == PEER_READ) { + dst_buffer = ze_dst_buffers[local_device_id]; + src_buffer = ze_src_buffers[remote_device_id]; + } + + uint32_t device_id = local_device_id; + if (use_queue_in_destination) { + device_id = remote_device_id; + } + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queues[e]].second; + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + command_list, + reinterpret_cast( + reinterpret_cast(dst_buffer) + e * chunk), + reinterpret_cast( + reinterpret_cast(src_buffer) + e * chunk), + chunk, nullptr, 1, &event)); 
+ + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + + for (size_t e = 0; e < num_engines; e++) { + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queues[e]].second; + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + command_list, std::numeric_limits::max())); + } + } + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queue_index].second; + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + command_list, dst_buffer, src_buffer, buffer_size, nullptr, 1, + &event)); + + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + command_list, std::numeric_limits::max())); + } + } + } + } + + Timer timer; + std::vector>> timers( + ze_peer_devices.size()); + for (size_t i = 0U; i < timers.size(); i++) { + timers[i].resize(ze_peer_devices.size()); + } + + std::vector> times(ze_peer_devices.size()); + for (size_t i = 0U; i < times.size(); i++) { + times[i].resize(ze_peer_devices.size()); + for (size_t j = 0U; j < times.size(); j++) { + times[i][j] = 0; + } + } + + do { + timer.start(); + for (uint32_t i = 0U; i < number_iterations; i++) { + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + queue_index_iter = 0; + for (size_t local_device_id_iter = 0; + local_device_id_iter < local_device_ids.size(); + local_device_id_iter++) { + for (size_t remote_device_id_iter = 0; + remote_device_id_iter < remote_device_ids.size(); + remote_device_id_iter++) { + + auto local_device_id = local_device_ids[local_device_id_iter]; + auto remote_device_id = remote_device_ids[remote_device_id_iter]; + + if (local_device_id == remote_device_id) { + continue; + } + + void *dst_buffer = ze_dst_buffers[remote_device_id]; + void *src_buffer = ze_src_buffers[local_device_id]; + if (transfer_type == PEER_READ) { + dst_buffer = ze_dst_buffers[local_device_id]; + src_buffer = 
ze_src_buffers[remote_device_id]; + } + + uint32_t device_id = local_device_id; + if (use_queue_in_destination) { + device_id = remote_device_id; + } + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queues[e]].second; + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + command_list, + reinterpret_cast( + reinterpret_cast(dst_buffer) + e * chunk), + reinterpret_cast( + reinterpret_cast(src_buffer) + e * chunk), + chunk, nullptr, 1, &event)); + } + } else { + uint32_t queue_index = queues[queue_index_iter++ % + static_cast(queues.size())]; + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queue_index].second; + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + command_list, dst_buffer, src_buffer, buffer_size, nullptr, 1, + &event)); + } + } + } + + for (size_t local_device_id_iter = 0; + local_device_id_iter < local_device_ids.size(); + local_device_id_iter++) { + for (size_t remote_device_id_iter = 0; + remote_device_id_iter < remote_device_ids.size(); + remote_device_id_iter++) { + + auto local_device_id = local_device_ids[local_device_id_iter]; + auto remote_device_id = remote_device_ids[remote_device_id_iter]; + + if (local_device_id == remote_device_id) { + continue; + } + + timers[local_device_id][remote_device_id].start(); + } + } + + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + + queue_index_iter = 0; + for (size_t local_device_id_iter = 0; + local_device_id_iter < local_device_ids.size(); + local_device_id_iter++) { + for (size_t remote_device_id_iter = 0; + remote_device_id_iter < remote_device_ids.size(); + remote_device_id_iter++) { + + auto local_device_id = local_device_ids[local_device_id_iter]; + auto remote_device_id = remote_device_ids[remote_device_id_iter]; + + if (local_device_id == remote_device_id) { + continue; + } + + uint32_t device_id = local_device_id; + if (use_queue_in_destination) { + device_id 
= remote_device_id; + } + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queues[e]].second; + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + command_list, std::numeric_limits::max())); + } + } else { + uint32_t queue_index = queues[queue_index_iter++ % + static_cast(queues.size())]; + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queue_index].second; + + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + command_list, std::numeric_limits::max())); + } + + timer.end(); + + timers[local_device_id][remote_device_id].end(); + times[local_device_id][remote_device_id] += + timers[local_device_id][remote_device_id].period_minus_overhead(); + } + } + } + + queue_index_iter = 0; + for (size_t local_device_id_iter = 0; + local_device_id_iter < local_device_ids.size(); + local_device_id_iter++) { + for (size_t remote_device_id_iter = 0; + remote_device_id_iter < remote_device_ids.size(); + remote_device_id_iter++) { + auto local_device_id = local_device_ids[local_device_id_iter]; + auto remote_device_id = remote_device_ids[remote_device_id_iter]; + + if (local_device_id == remote_device_id) { + continue; + } + + if (divide_buffers) { + std::cout << "\tDevice " << local_device_id << " - Device " + << remote_device_id << " : "; + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + + std::cout << "\tDevice " << local_device_id << " - Device " + << remote_device_id << " using queue " << queue_index + << ": "; + } + print_results(false, test_type, buffer_size, + times[local_device_id][remote_device_id]); + } + } + print_results(false, test_type, total_buffer_size, timer); + } while (run_continuously); +} + void ZePeer::bandwidth_latency_parallel_to_multiple_targets( peer_test_t test_type, peer_transfer_t transfer_type, size_t number_buffer_elements, std::vector &remote_device_ids, @@ -729,13 +1376,25 
@@ void ZePeer::bandwidth_latency_parallel_to_multiple_targets( buffer_size); if (bidirectional) { - perform_bidirectional_parallel_copy_to_multiple_targets( - test_type, transfer_type, remote_device_ids, local_device_ids, - buffer_size, divide_buffers); + if (ZePeer::use_immediate_cmdlist) { + perform_bidirectional_parallel_copy_to_multiple_targets_immediate( + test_type, transfer_type, remote_device_ids, local_device_ids, + buffer_size, divide_buffers); + } else { + perform_bidirectional_parallel_copy_to_multiple_targets( + test_type, transfer_type, remote_device_ids, local_device_ids, + buffer_size, divide_buffers); + } } else { - perform_parallel_copy_to_multiple_targets( - test_type, transfer_type, remote_device_ids, local_device_ids, - buffer_size, divide_buffers); + if (ZePeer::use_immediate_cmdlist) { + perform_parallel_copy_to_multiple_targets_immediate( + test_type, transfer_type, remote_device_ids, local_device_ids, + buffer_size, divide_buffers); + } else { + perform_parallel_copy_to_multiple_targets( + test_type, transfer_type, remote_device_ids, local_device_ids, + buffer_size, divide_buffers); + } } if (validate_results) { @@ -762,9 +1421,13 @@ void ZePeer::bandwidth_latency_parallel_to_multiple_targets( ze_peer_devices[local_device_id].engines[0].first; ze_command_list_handle_t command_list = ze_peer_devices[local_device_id].engines[0].second; - - validate_buffer(command_list, command_queue, ze_host_validate_buffer, - dst_buffer, ze_host_buffer, buffer_size); + if (ZePeer::use_immediate_cmdlist) { + validate_buffer_immediate(command_list, ze_host_validate_buffer, + dst_buffer, ze_host_buffer, buffer_size); + } else { + validate_buffer(command_list, command_queue, ze_host_validate_buffer, + dst_buffer, ze_host_buffer, buffer_size); + } for (size_t k = 0; k < buffer_size; k++) { ze_host_validate_buffer[k] = 0; } diff --git a/perf_tests/ze_peer/src/ze_peer_parallel_pair_targets.cpp b/perf_tests/ze_peer/src/ze_peer_parallel_pair_targets.cpp index 
0fbc9099..6da61b45 100644 --- a/perf_tests/ze_peer/src/ze_peer_parallel_pair_targets.cpp +++ b/perf_tests/ze_peer/src/ze_peer_parallel_pair_targets.cpp @@ -332,6 +332,314 @@ void ZePeer::perform_bidirectional_parallel_copy_to_pair_targets( } } +void ZePeer::perform_bidirectional_parallel_copy_to_pair_targets_immediate( + peer_test_t test_type, peer_transfer_t transfer_type, + std::vector> &pair_device_ids, + size_t buffer_size, bool divide_buffers) { + + size_t total_buffer_size = 0; + + size_t num_engines = queues.size(); + size_t chunk = buffer_size / num_engines; + + size_t queue_index_iter = 0; + for (auto pair_device_id : pair_device_ids) { + auto local_device_id = pair_device_id.first; + auto remote_device_id = pair_device_id.second; + + if (local_device_id == remote_device_id) { + continue; + } + + total_buffer_size += buffer_size; + } + + /* Warm up */ + for (uint32_t i = 0U; i < warm_up_iterations; i++) { + queue_index_iter = 0; + for (auto pair_device_id : pair_device_ids) { + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + auto local_device_id = pair_device_id.first; + auto remote_device_id = pair_device_id.second; + + if (local_device_id == remote_device_id) { + continue; + } + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + if (transfer_type == PEER_WRITE) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast( + ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast(reinterpret_cast( + ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast( + ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast(reinterpret_cast( + ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } else { + 
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast( + ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast(reinterpret_cast( + ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast( + ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast(reinterpret_cast( + ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } + } + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + + if (transfer_type == PEER_WRITE) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queue_index].second, + ze_dst_buffers[remote_device_id], ze_src_buffers[local_device_id], + buffer_size, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queue_index].second, + ze_dst_buffers[local_device_id], ze_src_buffers[remote_device_id], + buffer_size, nullptr, 1, &event)); + } else { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queue_index].second, + ze_dst_buffers[local_device_id], ze_src_buffers[remote_device_id], + buffer_size, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queue_index].second, + ze_dst_buffers[remote_device_id], ze_src_buffers[local_device_id], + buffer_size, nullptr, 1, &event)); + } + } + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[local_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + 
SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + } + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[local_device_id].engines[queue_index].second, + std::numeric_limits::max())); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[remote_device_id].engines[queue_index].second, + std::numeric_limits::max())); + } + } + } + + Timer timer; + std::vector>> timers( + ze_peer_devices.size()); + for (size_t i = 0U; i < timers.size(); i++) { + timers[i].resize(ze_peer_devices.size()); + } + + std::vector> times(ze_peer_devices.size()); + for (size_t i = 0U; i < times.size(); i++) { + times[i].resize(ze_peer_devices.size()); + for (size_t j = 0U; j < times.size(); j++) { + times[i][j] = 0; + } + } + + do { + timer.start(); + for (uint32_t i = 0U; i < number_iterations; i++) { + queue_index_iter = 0; + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + for (auto pair_device_id : pair_device_ids) { + auto local_device_id = pair_device_id.first; + auto remote_device_id = pair_device_id.second; + + if (local_device_id == remote_device_id) { + continue; + } + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + if (transfer_type == PEER_WRITE) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast( + ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast( + ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast( + ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast( + 
ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } else { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast( + ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast( + ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast( + ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast( + ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } + } + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + + if (transfer_type == PEER_WRITE) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queue_index].second, + ze_dst_buffers[remote_device_id], + ze_src_buffers[local_device_id], buffer_size, nullptr, 1, + &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queue_index].second, + ze_dst_buffers[local_device_id], + ze_src_buffers[remote_device_id], buffer_size, nullptr, 1, + &event)); + } else { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queue_index].second, + ze_dst_buffers[local_device_id], + ze_src_buffers[remote_device_id], buffer_size, nullptr, 1, + &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queue_index].second, + ze_dst_buffers[remote_device_id], + ze_src_buffers[local_device_id], buffer_size, nullptr, 1, + &event)); + } + } + } + + for (auto pair_device_id : pair_device_ids) { + auto local_device_id = pair_device_id.first; + auto remote_device_id = pair_device_id.second; + + if (local_device_id == 
remote_device_id) { + continue; + } + + timers[local_device_id][remote_device_id].start(); + } + + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + + queue_index_iter = 0; + for (auto pair_device_id : pair_device_ids) { + auto local_device_id = pair_device_id.first; + auto remote_device_id = pair_device_id.second; + + if (local_device_id == remote_device_id) { + continue; + } + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[local_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + } + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[local_device_id].engines[queue_index].second, + std::numeric_limits::max())); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[remote_device_id].engines[queue_index].second, + std::numeric_limits::max())); + } + timer.end(); + + timers[local_device_id][remote_device_id].end(); + times[local_device_id][remote_device_id] += + timers[local_device_id][remote_device_id].period_minus_overhead(); + } + } + + queue_index_iter = 0; + for (auto pair_device_id : pair_device_ids) { + auto local_device_id = pair_device_id.first; + auto remote_device_id = pair_device_id.second; + + if (local_device_id == remote_device_id) { + continue; + } + + if (divide_buffers) { + std::cout << "\tDevice " << local_device_id << " - Device " + << remote_device_id << " : "; + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + + std::cout << "\tDevice " << local_device_id << " - Device " + << remote_device_id << " using queue " << queue_index << ": "; + } + + print_results(true, test_type, buffer_size, + 
times[local_device_id][remote_device_id]); + } + print_results(true, test_type, total_buffer_size, timer); + } while (run_continuously); +} + void ZePeer::perform_parallel_copy_to_pair_targets( peer_test_t test_type, peer_transfer_t transfer_type, std::vector> &pair_device_ids, @@ -626,6 +934,234 @@ void ZePeer::perform_parallel_copy_to_pair_targets( } } +void ZePeer::perform_parallel_copy_to_pair_targets_immediate( + peer_test_t test_type, peer_transfer_t transfer_type, + std::vector> &pair_device_ids, + size_t buffer_size, bool divide_buffers) { + + size_t total_buffer_size = 0; + + size_t num_engines = queues.size(); + size_t chunk = buffer_size / num_engines; + + size_t queue_index_iter = 0; + for (auto pair_device_id : pair_device_ids) { + auto local_device_id = pair_device_id.first; + auto remote_device_id = pair_device_id.second; + + if (local_device_id == remote_device_id) { + continue; + } + + total_buffer_size += buffer_size; + } + + /* Warm up */ + for (uint32_t i = 0U; i < warm_up_iterations; i++) { + queue_index_iter = 0; + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + for (auto pair_device_id : pair_device_ids) { + auto local_device_id = pair_device_id.first; + auto remote_device_id = pair_device_id.second; + + if (local_device_id == remote_device_id) { + continue; + } + + uint32_t device_id = local_device_id; + if (use_queue_in_destination) { + device_id = remote_device_id; + } + + void *dst_buffer = ze_dst_buffers[remote_device_id]; + void *src_buffer = ze_src_buffers[local_device_id]; + if (transfer_type == PEER_READ) { + dst_buffer = ze_dst_buffers[local_device_id]; + src_buffer = ze_src_buffers[remote_device_id]; + } + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queues[e]].second; + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + command_list, + reinterpret_cast(reinterpret_cast(dst_buffer) + + e * chunk), + 
reinterpret_cast(reinterpret_cast(src_buffer) + + e * chunk), + chunk, nullptr, 1, &event)); + } + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queue_index].second; + SUCCESS_OR_TERMINATE( + zeCommandListAppendMemoryCopy(command_list, dst_buffer, src_buffer, + buffer_size, nullptr, 1, &event)); + } + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queues[e]].second; + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + command_list, std::numeric_limits::max())); + } + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queue_index].second; + + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + command_list, std::numeric_limits::max())); + } + } + } + + Timer timer; + std::vector>> timers( + ze_peer_devices.size()); + for (size_t i = 0U; i < timers.size(); i++) { + timers[i].resize(ze_peer_devices.size()); + } + + std::vector> times(ze_peer_devices.size()); + for (size_t i = 0U; i < times.size(); i++) { + times[i].resize(ze_peer_devices.size()); + for (size_t j = 0U; j < times.size(); j++) { + times[i][j] = 0; + } + } + + do { + timer.start(); + for (uint32_t i = 0U; i < number_iterations; i++) { + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + queue_index_iter = 0; + for (auto pair_device_id : pair_device_ids) { + auto local_device_id = pair_device_id.first; + auto remote_device_id = pair_device_id.second; + + if (local_device_id == remote_device_id) { + continue; + } + + uint32_t device_id = local_device_id; + if (use_queue_in_destination) { + device_id = remote_device_id; + } + + void *dst_buffer = ze_dst_buffers[remote_device_id]; + void *src_buffer = 
ze_src_buffers[local_device_id]; + if (transfer_type == PEER_READ) { + dst_buffer = ze_dst_buffers[local_device_id]; + src_buffer = ze_src_buffers[remote_device_id]; + } + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queues[e]].second; + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + command_list, + reinterpret_cast( + reinterpret_cast(dst_buffer) + e * chunk), + reinterpret_cast( + reinterpret_cast(src_buffer) + e * chunk), + chunk, nullptr, 1, &event)); + } + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queue_index].second; + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + command_list, dst_buffer, src_buffer, buffer_size, nullptr, 1, + &event)); + } + } + + for (auto pair_device_id : pair_device_ids) { + auto local_device_id = pair_device_id.first; + auto remote_device_id = pair_device_id.second; + + if (local_device_id == remote_device_id) { + continue; + } + + timers[local_device_id][remote_device_id].start(); + } + + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + + queue_index_iter = 0; + for (auto pair_device_id : pair_device_ids) { + auto local_device_id = pair_device_id.first; + auto remote_device_id = pair_device_id.second; + + if (local_device_id == remote_device_id) { + continue; + } + + uint32_t device_id = local_device_id; + if (use_queue_in_destination) { + device_id = remote_device_id; + } + + if (divide_buffers) { + for (size_t e = 0; e < num_engines; e++) { + ze_command_list_handle_t command_list = + ze_peer_devices[device_id].engines[queues[e]].second; + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + command_list, std::numeric_limits::max())); + } + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + ze_command_list_handle_t command_list = + 
ze_peer_devices[device_id].engines[queue_index].second; + + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + command_list, std::numeric_limits::max())); + } + + timer.end(); + + timers[local_device_id][remote_device_id].end(); + times[local_device_id][remote_device_id] += + timers[local_device_id][remote_device_id].period_minus_overhead(); + } + } + + queue_index_iter = 0; + for (auto pair_device_id : pair_device_ids) { + auto local_device_id = pair_device_id.first; + auto remote_device_id = pair_device_id.second; + + if (local_device_id == remote_device_id) { + continue; + } + + if (divide_buffers) { + std::cout << "\tDevice " << local_device_id << " - Device " + << remote_device_id << " : "; + } else { + uint32_t queue_index = + queues[queue_index_iter++ % static_cast(queues.size())]; + + std::cout << "\tDevice " << local_device_id << " - Device " + << remote_device_id << " using queue " << queue_index << ": "; + } + print_results(false, test_type, buffer_size, + times[local_device_id][remote_device_id]); + } + print_results(false, test_type, total_buffer_size, timer); + } while (run_continuously); +} + void ZePeer::bandwidth_latency_parallel_to_pair_targets( peer_test_t test_type, peer_transfer_t transfer_type, size_t number_buffer_elements, @@ -660,11 +1196,25 @@ void ZePeer::bandwidth_latency_parallel_to_pair_targets( buffer_size); if (bidirectional) { - perform_bidirectional_parallel_copy_to_pair_targets( - test_type, transfer_type, pair_device_ids, buffer_size, divide_buffers); + if (ZePeer::use_immediate_cmdlist) { + perform_bidirectional_parallel_copy_to_pair_targets_immediate( + test_type, transfer_type, pair_device_ids, buffer_size, + divide_buffers); + } else { + perform_bidirectional_parallel_copy_to_pair_targets( + test_type, transfer_type, pair_device_ids, buffer_size, + divide_buffers); + } } else { - perform_parallel_copy_to_pair_targets( - test_type, transfer_type, pair_device_ids, buffer_size, divide_buffers); + if 
(ZePeer::use_immediate_cmdlist) { + perform_parallel_copy_to_pair_targets_immediate( + test_type, transfer_type, pair_device_ids, buffer_size, + divide_buffers); + } else { + perform_parallel_copy_to_pair_targets(test_type, transfer_type, + pair_device_ids, buffer_size, + divide_buffers); + } } if (validate_results) { @@ -686,8 +1236,13 @@ void ZePeer::bandwidth_latency_parallel_to_pair_targets( ze_command_list_handle_t command_list = ze_peer_devices[local_device_id].engines[0].second; - validate_buffer(command_list, command_queue, ze_host_validate_buffer, - dst_buffer, ze_host_buffer, buffer_size); + if (ZePeer::use_immediate_cmdlist) { + validate_buffer_immediate(command_list, ze_host_validate_buffer, + dst_buffer, ze_host_buffer, buffer_size); + } else { + validate_buffer(command_list, command_queue, ze_host_validate_buffer, + dst_buffer, ze_host_buffer, buffer_size); + } for (size_t k = 0; k < buffer_size; k++) { ze_host_validate_buffer[k] = 0; } diff --git a/perf_tests/ze_peer/src/ze_peer_parallel_single_target.cpp b/perf_tests/ze_peer/src/ze_peer_parallel_single_target.cpp index adebe557..d60d6104 100644 --- a/perf_tests/ze_peer/src/ze_peer_parallel_single_target.cpp +++ b/perf_tests/ze_peer/src/ze_peer_parallel_single_target.cpp @@ -110,6 +110,7 @@ void ZePeer::perform_bidirectional_parallel_copy_to_single_target( ze_peer_devices[remote_device_id].engines[queues[e]].first, std::numeric_limits::max())); } + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); } timer.end(); @@ -124,6 +125,134 @@ void ZePeer::perform_bidirectional_parallel_copy_to_single_target( } } +void ZePeer::perform_bidirectional_parallel_copy_to_single_target_immediate( + peer_test_t test_type, peer_transfer_t transfer_type, + uint32_t remote_device_id, uint32_t local_device_id, size_t buffer_size) { + size_t num_engines = queues.size(); + size_t chunk = buffer_size / num_engines; + + Timer timer; + + /* Warm up */ + for (uint32_t i = 0U; i < warm_up_iterations; i++) { + 
SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + for (size_t e = 0; e < num_engines; e++) { + if (transfer_type == PEER_WRITE) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast(ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast(ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast(ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast(ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } else { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast(ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast(ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast(ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast(ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } + } + + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + + for (size_t e = 0; e < num_engines; e++) { + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[local_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + } + } + + do { + timer.start(); + for (uint32_t i = 0U; i < number_iterations; i++) { + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + for (size_t e = 0; e 
< num_engines; e++) { + if (transfer_type == PEER_WRITE) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast(ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast(ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast(ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast(ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } else { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[local_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast(ze_dst_buffers[local_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast(ze_src_buffers[remote_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + reinterpret_cast( + reinterpret_cast(ze_dst_buffers[remote_device_id]) + + e * chunk), + reinterpret_cast( + reinterpret_cast(ze_src_buffers[local_device_id]) + + e * chunk), + chunk, nullptr, 1, &event)); + } + } + + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + + for (size_t e = 0; e < num_engines; e++) { + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[local_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[remote_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + } + } + timer.end(); + + print_results(true, test_type, buffer_size, timer); + } while (run_continuously); +} + void ZePeer::perform_parallel_copy_to_single_target( peer_test_t test_type, peer_transfer_t transfer_type, uint32_t 
remote_device_id, uint32_t local_device_id, size_t buffer_size) { @@ -209,6 +338,87 @@ void ZePeer::perform_parallel_copy_to_single_target( } } +/* Unidirectional parallel copy used when use_immediate_cmdlist is set. buffer_size is split into queues.size() equal chunks and one zeCommandListAppendMemoryCopy is appended per engine, each waiting on `event`; the host then signals `event` and synchronizes every engine's command list. Data moves from ze_src_buffers[local_device_id] to ze_dst_buffers[remote_device_id], or from ze_src_buffers[remote_device_id] to ze_dst_buffers[local_device_id] for PEER_READ. The command lists used belong to local_device_id unless use_queue_in_destination is set, in which case remote_device_id's are used. warm_up_iterations untimed rounds are followed by number_iterations timed rounds whose summed timer.period_minus_overhead() is handed to print_results; the measurement repeats while run_continuously is set. */ void ZePeer::perform_parallel_copy_to_single_target_immediate( + peer_test_t test_type, peer_transfer_t transfer_type, + uint32_t remote_device_id, uint32_t local_device_id, size_t buffer_size) { + uint32_t queue_device_id = local_device_id; + if (use_queue_in_destination) { + queue_device_id = remote_device_id; + } + + void *dst_buffer = ze_dst_buffers[remote_device_id]; + void *src_buffer = ze_src_buffers[local_device_id]; + if (transfer_type == PEER_READ) { + dst_buffer = ze_dst_buffers[local_device_id]; + src_buffer = ze_src_buffers[remote_device_id]; + } + + size_t num_engines = queues.size(); + size_t chunk = buffer_size / num_engines; /* Integer division: the buffer_size % num_engines remainder bytes belong to no chunk, and an empty `queues` would divide by zero here. */ + /* NOTE(review): this loop appends one copy per engine before the warm-up. If these are immediate command lists (as the function name suggests), an append is submitted right away rather than recorded for replay, so these copies would wait on `event` and then run in addition to the copies of the first warm-up (or first timed) round - confirm this extra pass is intended. */ for (size_t e = 0; e < num_engines; e++) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[queue_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast(dst_buffer) + + e * chunk), + reinterpret_cast(reinterpret_cast(src_buffer) + + e * chunk), + chunk, nullptr, 1, &event)); + } + + Timer timer; + + /* Warm up: same append / signal / synchronize sequence as the timed loop, without timing. */ + for (uint32_t i = 0U; i < warm_up_iterations; i++) { + for (size_t e = 0; e < num_engines; e++) { + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); /* Reset is issued once per engine here, whereas the timed loop below resets once per iteration. */ + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[queue_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast(dst_buffer) + + e * chunk), + reinterpret_cast(reinterpret_cast(src_buffer) + + e * chunk), + chunk, nullptr, 1, &event)); + } + + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); /* Every copy appended above lists `event` as its wait event, so this host signal is what lets them start. */ + + for (size_t e = 0; e < num_engines; e++) { + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[queue_device_id].engines[queues[e]].second, + std::numeric_limits::max())); /* NOTE(review): numeric_limits has no template argument in this rendering (the reinterpret_cast calls look similarly stripped) - restore from the original patch. */ + } + } + + do { + long double time_usec = 0; + for (uint32_t i = 0U; i < number_iterations; i++) { + SUCCESS_OR_TERMINATE(zeEventHostReset(event)); + for (size_t e = 0; e
< num_engines; e++) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy( + ze_peer_devices[queue_device_id].engines[queues[e]].second, + reinterpret_cast(reinterpret_cast(dst_buffer) + + e * chunk), + reinterpret_cast(reinterpret_cast(src_buffer) + + e * chunk), + chunk, nullptr, 1, &event)); + } + + timer.start(); /* Started after the appends, so only the host signal and the per-engine synchronize calls fall inside the timed region. */ + SUCCESS_OR_TERMINATE(zeEventHostSignal(event)); + + for (size_t e = 0; e < num_engines; e++) { + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + ze_peer_devices[queue_device_id].engines[queues[e]].second, + std::numeric_limits::max())); + } + timer.end(); + time_usec += timer.period_minus_overhead(); /* Accumulated across all number_iterations rounds before reporting. */ + } + + print_results(false, test_type, buffer_size, time_usec); + } while (run_continuously); +} + void ZePeer::bandwidth_latency_parallel_to_single_target( peer_test_t test_type, peer_transfer_t transfer_type, size_t number_buffer_elements, uint32_t remote_device_id, @@ -224,13 +434,25 @@ void ZePeer::bandwidth_latency_parallel_to_single_target( buffer_size); if (bidirectional) { - perform_bidirectional_parallel_copy_to_single_target( - test_type, transfer_type, remote_device_id, local_device_id, - buffer_size); + if (ZePeer::use_immediate_cmdlist) { + perform_bidirectional_parallel_copy_to_single_target_immediate( + test_type, transfer_type, remote_device_id, local_device_id, + buffer_size); + } else { + perform_bidirectional_parallel_copy_to_single_target( + test_type, transfer_type, remote_device_id, local_device_id, + buffer_size); + } } else { - perform_parallel_copy_to_single_target(test_type, transfer_type, - remote_device_id, local_device_id, - buffer_size); + if (ZePeer::use_immediate_cmdlist) { + perform_parallel_copy_to_single_target_immediate( + test_type, transfer_type, remote_device_id, local_device_id, + buffer_size); + } else { + perform_parallel_copy_to_single_target(test_type, transfer_type, + remote_device_id, local_device_id, + buffer_size); + } } if (validate_results) { @@ -243,9 +465,13 @@ void
ZePeer::bandwidth_latency_parallel_to_single_target( if (transfer_type == PEER_READ) { dst_buffer = ze_dst_buffers[local_device_id]; } - - validate_buffer(command_list, command_queue, ze_host_validate_buffer, - dst_buffer, ze_host_buffer, buffer_size); + if (ZePeer::use_immediate_cmdlist) { + validate_buffer_immediate(command_list, ze_host_validate_buffer, + dst_buffer, ze_host_buffer, buffer_size); + } else { + validate_buffer(command_list, command_queue, ze_host_validate_buffer, + dst_buffer, ze_host_buffer, buffer_size); + } } tear_down(remote_device_ids, local_device_ids); diff --git a/perf_tests/ze_peer/src/ze_peer_unidirectional.cpp b/perf_tests/ze_peer/src/ze_peer_unidirectional.cpp index f1d1aea1..6bdae7f7 100644 --- a/perf_tests/ze_peer/src/ze_peer_unidirectional.cpp +++ b/perf_tests/ze_peer/src/ze_peer_unidirectional.cpp @@ -43,6 +43,38 @@ void ZePeer::perform_copy(peer_test_t test_type, SUCCESS_OR_TERMINATE(zeCommandListReset(command_list)); } +/* Copy benchmark on a single command list, without a separate command queue argument. Each round appends one copy of buffer_size bytes from src_buffer to dst_buffer (no signal event, no wait events) and then waits in zeCommandListHostSynchronize. warm_up_iterations rounds run untimed; afterwards number_iterations rounds are timed as one interval and reported through print_results, repeating while run_continuously is set. The command list is reset before returning. */ void ZePeer::perform_copy_immediate(peer_test_t test_type, + ze_command_list_handle_t command_list, + void *dst_buffer, void *src_buffer, + size_t buffer_size) { + Timer timer; + + /* Warm up: untimed rounds, each waited on before the next is appended. */ + for (uint32_t i = 0U; i < warm_up_iterations; i++) { + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, dst_buffer, + src_buffer, buffer_size, + nullptr, 0, nullptr)); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + command_list, std::numeric_limits::max())); /* NOTE(review): numeric_limits has no template argument in this rendering - restore from the original patch. */ + } + + do { + timer.start(); /* One interval wraps the whole loop, so the host-side append and synchronize calls of every round are included in the measurement. */ + for (uint32_t i = 0U; i < number_iterations; i++) { + SUCCESS_OR_TERMINATE( + zeCommandListAppendMemoryCopy(command_list, dst_buffer, src_buffer, + buffer_size, nullptr, 0, nullptr)); + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize( + command_list, std::numeric_limits::max())); + } + timer.end(); + + print_results(false, test_type, buffer_size, timer); + } while (run_continuously); + + SUCCESS_OR_TERMINATE(zeCommandListReset(command_list)); +} + void ZePeer::bandwidth_latency(peer_test_t test_type,
peer_transfer_t transfer_type, size_t number_buffer_elements, @@ -71,12 +103,22 @@ void ZePeer::bandwidth_latency(peer_test_t test_type, initialize_buffers(remote_device_ids, local_device_ids, ze_host_buffer, buffer_size); - perform_copy(test_type, command_list, command_queue, dst_buffer, src_buffer, - buffer_size); + if (ZePeer::use_immediate_cmdlist) { + perform_copy_immediate(test_type, command_list, dst_buffer, src_buffer, + buffer_size); + } else { + perform_copy(test_type, command_list, command_queue, dst_buffer, src_buffer, + buffer_size); + } if (validate_results) { - validate_buffer(command_list, command_queue, ze_host_validate_buffer, - dst_buffer, ze_host_buffer, buffer_size); + if (ZePeer::use_immediate_cmdlist) { + validate_buffer_immediate(command_list, ze_host_validate_buffer, + dst_buffer, ze_host_buffer, buffer_size); + } else { + validate_buffer(command_list, command_queue, ze_host_validate_buffer, + dst_buffer, ze_host_buffer, buffer_size); + } } tear_down(remote_device_ids, local_device_ids);