Files
Richard Sandiford 733a6a4c11 libgomp: Update SVE tests
The new SVE tests didn't explicitly force SVE to be enabled,
which meant that they wouldn't work on targets that aren't
configured for SVE by default.  The least invasive way of
fixing that is to add a pragma, which works for most tests.
However, for udr-sve.c, the global:

 #pragma omp declare reduction (+:svint32_t: omp_out = svadd_s32_z (svptrue_b32(), omp_in, omp_out)) \
		    initializer (omp_priv = svindex_s32 (0, 0))

does not work with an earlier:

 #pragma GCC target "+sve"

which is interesting, and maybe worthy of a PR if there isn't one
already.  It seems we have to force SVE (and thus an architecture)
on the command line instead.

However, with that fixed, udr-sve.c fails execution.  One problem
seems to be a missing accumulation in for_reduction.  Fixing that
is enough to reach the final inscan_reduction_incl, but that fails
for reasons I haven't investigated yet.  I would need to read up
more to understand what the loop is doing.

It also looks like there might be a missing "+" in simd_reduction:

  #pragma omp simd reduction (+:va, i)
  for (j = 0; j < 16; j++)
    va = svld1_s32 (svptrue_b32 (), a);

  res = svaddv_s32 (svptrue_b32 (), va);

  if (res != 8)
    __builtin_abort ();

since AFAICT the loop is not doing a reduction as things stand.
But perhaps that's deliberate, since it does match the != 8 test.

libgomp/
	* testsuite/libgomp.c-target/aarch64/firstprivate.c: Add +sve pragma.
	* testsuite/libgomp.c-target/aarch64/lastprivate.c: Likewise.
	* testsuite/libgomp.c-target/aarch64/private.c: Likewise.
	* testsuite/libgomp.c-target/aarch64/shared.c: Likewise.
	* testsuite/libgomp.c-target/aarch64/simd-aligned.c: Likewise.
	* testsuite/libgomp.c-target/aarch64/simd-nontemporal.c: Likewise.
	* testsuite/libgomp.c-target/aarch64/threadprivate.c: Likewise.
	* testsuite/libgomp.c-target/aarch64/udr-sve.c: Add an -march option.
	(for_reduction): Use "+=" in the reduction loop.
2025-04-10 21:09:07 +01:00

267 lines
6.3 KiB
C

/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-msve-vector-bits=256 -fopenmp -O2" } */
#pragma GCC target "+sve"
#include <arm_sve.h>
#include <stdlib.h>
#include <omp.h>
/* Abort unless every 32-bit lane of X compares equal to the matching lane
   of Y.  Marked noipa so the check is kept as a real call.  */
static void __attribute__ ((noipa))
compare_vec (svint32_t x, svint32_t y)
{
  svbool_t all_lanes = svptrue_b32 ();
  svbool_t equal_lanes = svcmpeq_s32 (all_lanes, x, y);
  /* Invert the equality result to get the lanes that differ.  */
  svbool_t differing_lanes = svnot_b_z (all_lanes, equal_lanes);

  if (!svptest_any (all_lanes, differing_lanes))
    return;

  __builtin_abort ();
}
/* Abort if predicates X and Y differ in any position selected by the
   all-true 32-bit-element predicate.  */
static void __attribute__ ((noipa))
compare_vecb (svbool_t x, svbool_t y)
{
  svbool_t all_lanes = svptrue_b32 ();
  /* Exclusive OR leaves a bit set wherever the two predicates disagree.  */
  svbool_t mismatch = sveor_b_z (all_lanes, x, y);

  if (!svptest_any (all_lanes, mismatch))
    return;

  __builtin_abort ();
}
/* Check that the SVE vector arguments A and B and the predicate P are shared
   between the threads of a parallel region that uses default (shared).
   main in this file passes a = svindex_s32 (0, 1), b = svindex_s32 (8, 1)
   and p = svptrue_b32 (); the checks below hard-code those values.  Any
   mismatch aborts through compare_vec / compare_vecb.

   NOTE(review): num_threads (10) is a request.  If the runtime provides
   fewer than three threads, no thread has number 2, 'a' is never updated and
   the later checks would abort -- confirm the test environment always
   supplies enough threads.  */
void __attribute__ ((noipa))
implicit_shared_default (svint32_t a, svint32_t b, svbool_t p)
{
#pragma omp parallel default (shared) num_threads (10)
{
/* 'a', 'b' and 'p' are implicitly shared. */
/* Every thread first checks the incoming values.  */
compare_vec (a, svindex_s32 (0, 1));
compare_vec (b, svindex_s32 (8, 1));
compare_vecb (p, svptrue_b32 ());
#pragma omp barrier
/* Only thread 2 writes 'a'; the barriers on either side separate the write
   from the other threads' reads.  */
if (omp_get_thread_num () == 2)
a = svadd_s32_z (p, a, b);
#pragma omp barrier
/* Thread 0 must see thread 2's update to 'a' (a + b == svindex_s32 (8, 2))
   while 'b' and 'p' are still unchanged; it then writes 'b'.  */
if (omp_get_thread_num () == 0)
{
compare_vec (a, svindex_s32 (8, 2));
compare_vec (b, svindex_s32 (8, 1));
compare_vecb (p, svptrue_b32 ());
b = svadd_s32_z (p, a, b);
}
#pragma omp barrier
/* All threads must now see both updates.  */
compare_vec (a, svindex_s32 (8, 2));
compare_vec (b, svadd_s32_z (p, svindex_s32 (8, 2), svindex_s32 (8, 1)));
#pragma omp barrier
/* The two writer threads repeat the check after a further barrier.  */
if (omp_get_thread_num () == 0 || omp_get_thread_num () == 2)
{
compare_vec (a, svindex_s32 (8, 2));
compare_vec (b, svadd_s32_z (p, svindex_s32 (8, 2), svindex_s32 (8, 1)));
}
}
}
/* Check that the SVE vector arguments A and B and the predicate P are shared
   between the threads of a parallel region when they are listed in an
   explicit shared clause.  main in this file passes
   a = svindex_s32 (0, 1), b = svindex_s32 (8, 1) and p = svptrue_b32 ();
   the checks below hard-code those values.  Any mismatch aborts through
   compare_vec / compare_vecb.

   NOTE(review): num_threads (12) is a request.  If the runtime provides
   fewer than three threads, no thread has number 2, 'a' is never updated and
   the later checks would abort -- confirm the test environment always
   supplies enough threads.  */
void __attribute__ ((noipa))
explicit_shared (svint32_t a, svint32_t b, svbool_t p)
{
#pragma omp parallel shared (a, b, p) num_threads (12)
{
/* 'a', 'b' and 'p' are explicitly shared. */
/* Every thread first checks the incoming values.  */
compare_vec (a, svindex_s32 (0, 1));
compare_vec (b, svindex_s32 (8, 1));
compare_vecb (p, svptrue_b32 ());
#pragma omp barrier
/* Only thread 2 writes 'a'; the barriers on either side separate the write
   from the other threads' reads.  */
if (omp_get_thread_num () == 2)
a = svadd_s32_z (p, a, b);
#pragma omp barrier
/* Thread 0 must see thread 2's update to 'a' (a + b == svindex_s32 (8, 2))
   while 'b' and 'p' are still unchanged; it then writes 'b'.  */
if (omp_get_thread_num () == 0)
{
compare_vec (a, svindex_s32 (8, 2));
compare_vec (b, svindex_s32 (8, 1));
compare_vecb (p, svptrue_b32 ());
b = svadd_s32_z (p, a, b);
}
#pragma omp barrier
/* All threads must now see both updates.  */
compare_vec (a, svindex_s32 (8, 2));
compare_vec (b, svadd_s32_z (p, svindex_s32 (8, 2), svindex_s32 (8, 1)));
#pragma omp barrier
/* The two writer threads repeat the check after a further barrier.  */
if (omp_get_thread_num () == 0 || omp_get_thread_num () == 2)
{
compare_vec (a, svindex_s32 (8, 2));
compare_vec (b, svadd_s32_z (p, svindex_s32 (8, 2), svindex_s32 (8, 1)));
}
}
}
/* Check that the SVE vector arguments A and B and the predicate P are shared
   between the threads of a parallel region that has neither a default clause
   nor a shared clause.  main in this file passes a = svindex_s32 (0, 1),
   b = svindex_s32 (8, 1) and p = svptrue_b32 (); the checks below hard-code
   those values.  Any mismatch aborts through compare_vec / compare_vecb.

   NOTE(review): num_threads (16) is a request.  The updates are made by
   threads 12 and 15, so if the runtime provides fewer than 16 threads the
   later checks would abort -- confirm the test environment always supplies
   enough threads.  */
void __attribute__ ((noipa))
implicit_shared_no_default (svint32_t a, svint32_t b, svbool_t p)
{
#pragma omp parallel num_threads (16)
{
/* 'a', 'b' and 'p' are implicitly shared without default clause. */
/* Every thread first checks the incoming values.  */
compare_vec (a, svindex_s32 (0, 1));
compare_vec (b, svindex_s32 (8, 1));
compare_vecb (p, svptrue_b32 ());
#pragma omp barrier
/* Only thread 12 writes 'a'; the barriers on either side separate the write
   from the other threads' reads.  */
if (omp_get_thread_num () == 12)
a = svadd_s32_z (p, a, b);
#pragma omp barrier
/* Thread 15 must see thread 12's update to 'a' (a + b == svindex_s32 (8, 2))
   while 'b' and 'p' are still unchanged; it then writes 'b'.  */
if (omp_get_thread_num () == 15)
{
compare_vec (a, svindex_s32 (8, 2));
compare_vec (b, svindex_s32 (8, 1));
compare_vecb (p, svptrue_b32 ());
b = svadd_s32_z (p, a, b);
}
#pragma omp barrier
/* All threads must now see both updates.  */
compare_vec (a, svindex_s32 (8, 2));
compare_vec (b, svadd_s32_z (p, svindex_s32 (8, 2), svindex_s32 (8, 1)));
#pragma omp barrier
/* The two writer threads repeat the check after a further barrier.  */
if (omp_get_thread_num () == 12 || omp_get_thread_num () == 15)
{
compare_vec (a, svindex_s32 (8, 2));
compare_vec (b, svadd_s32_z (p, svindex_s32 (8, 2), svindex_s32 (8, 1)));
}
}
}
/* Check sharing of a mix of variables inside one parallel region: the local
   SVE vector 'a', the vector argument B, the predicate argument P and the
   heap buffer reached through 'm'.  main in this file passes
   b = svindex_s32 (8, 1) and p = svptrue_b32 (); the checks below hard-code
   those values.  Any mismatch aborts through compare_vec / compare_vecb.

   The buffer is checked after allocation and released before returning.

   NOTE(review): num_threads (16) is a request.  The updates are made by
   threads 10, 12 and 15, so if the runtime provides fewer than 16 threads
   the later checks would abort -- confirm the test environment always
   supplies enough threads.  */
void __attribute__ ((noipa))
mix_shared (svint32_t b, svbool_t p)
{
  svint32_t a = svindex_s32 (0, 0);
  int *m = malloc (8 * sizeof *m);
  int i;

  /* The buffer is written below and later read by svld1_s32; fail the test
     cleanly rather than dereference a null pointer.  */
  if (!m)
    __builtin_abort ();

  /* Fill the buffer with 0..7 so that loading it gives svindex_s32 (0, 1).  */
#pragma omp parallel for
  for (i = 0; i < 8; i++)
    m[i] = i;

#pragma omp parallel num_threads (16)
  {
    /* Every thread first checks the initial values of 'a' and 'b'.  */
    compare_vec (a, svindex_s32 (0, 0));
    compare_vec (b, svindex_s32 (8, 1));
#pragma omp barrier

    /* 'm' is predetermined shared here.  'a' is implicitly shared here.  */
    if (omp_get_thread_num () == 10)
      a = svld1_s32 (svptrue_b32 (), m);
#pragma omp barrier

    /* 'a', 'b' and 'p' are implicitly shared without default clause.
       All threads must see the values loaded by thread 10.  */
    compare_vec (a, svindex_s32 (0, 1));
    compare_vec (b, svindex_s32 (8, 1));
    compare_vecb (p, svptrue_b32 ());
#pragma omp barrier

    /* Only thread 12 writes 'a'.  */
    if (omp_get_thread_num () == 12)
      a = svadd_s32_z (p, a, b);
#pragma omp barrier

    /* Thread 15 must see thread 12's update to 'a' while 'b' and 'p' are
       still unchanged; it then writes 'b'.  */
    if (omp_get_thread_num () == 15)
      {
	compare_vec (a, svindex_s32 (8, 2));
	compare_vec (b, svindex_s32 (8, 1));
	compare_vecb (p, svptrue_b32 ());
	b = svadd_s32_z (p, a, b);
      }
#pragma omp barrier

    /* The two writer threads check the final values first...  */
    if (omp_get_thread_num () == 12 || omp_get_thread_num () == 15)
      {
	compare_vec (a, svindex_s32 (8, 2));
	compare_vec (b, svadd_s32_z (p, svindex_s32 (8, 2),
				     svindex_s32 (8, 1)));
      }
#pragma omp barrier

    /* ...and then every thread does.  */
    compare_vec (a, svindex_s32 (8, 2));
    compare_vec (b, svadd_s32_z (p, svindex_s32 (8, 2), svindex_s32 (8, 1)));
  }

  /* The parallel region has ended, so no thread still reads the buffer.  */
  free (m);
}
/* v8si is svint32_t with its length fixed to N bits through the
   arm_sve_vector_bits attribute; predetermined_shared_static uses it for a
   static variable with a brace initializer.  N comes from the compiler's
   __ARM_FEATURE_SVE_BITS macro -- presumably 256 here, given the
   -msve-vector-bits=256 option and the eight-element initializer; confirm
   against the ACLE.  */
#define N __ARM_FEATURE_SVE_BITS
#define FIXED_ATTR __attribute__((arm_sve_vector_bits (N)))
typedef svint32_t v8si FIXED_ATTR;
/* Check sharing of a heap buffer written from a parallel loop and of a
   function-local static fixed-length SVE vector 'a' inside a parallel
   region.

   N selects the values expected on entry: 0 for the first call, when 'a'
   still holds its initializer { 0, ..., 7 }, and 1 for the second call, when
   'a' holds the values left by the first call.  Each call adds 1 to every
   lane of 'a'.  For any other N no comparison is made.  Any mismatch aborts
   through compare_vec.

   The buffer is checked after allocation and released before returning.

   NOTE(review): num_threads (16) is a request.  The update is made by thread
   12, so if the runtime provides fewer than 13 threads the final checks
   would abort -- confirm the test environment always supplies enough
   threads.  */
void __attribute__ ((noipa))
predetermined_shared_static (int n)
{
  int *m = malloc (8 * sizeof *m);
  int i;

  /* The loop below writes through 'm'; fail the test cleanly rather than
     dereference a null pointer.  */
  if (!m)
    __builtin_abort ();

  /* 'm' is predetermined shared here. */
#pragma omp parallel for
  for (i = 0; i < 8; i++)
    m[i] = i;

  static v8si a = { 0, 1, 2, 3, 4, 5, 6, 7 };

#pragma omp parallel num_threads (16)
  {
    /* 'a' is implicit shared here. */
    if (n == 0)
      compare_vec (a, svindex_s32 (0, 1));
    if (n == 1)
      compare_vec (a, svindex_s32 (1, 1));
#pragma omp barrier

    /* Only thread 12 writes 'a': it rechecks the entry value and then adds 1
       to every lane (svindex_s32 (1, 0) is a vector of ones).  */
    if (omp_get_thread_num () == 12)
      {
	if (n == 0)
	  compare_vec (a, svindex_s32 (0, 1));
	if (n == 1)
	  compare_vec (a, svindex_s32 (1, 1));
	a = svadd_s32_z (svptrue_b32 (), a, svindex_s32 (1, 0));
      }
#pragma omp barrier

    /* All threads must see thread 12's update.  */
    if (n == 0)
      compare_vec (a, svindex_s32 (1, 1));
    if (n == 1)
      compare_vec (a, svindex_s32 (2, 1));
  }

  /* Nothing reads the buffer after the parallel loop, so release it.  */
  free (m);
}
/* Run each sharing scenario in turn.  The vector arguments are
   svindex_s32 (0, 1) and svindex_s32 (8, 1) and the predicate is all-true;
   the callees hard-code checks against these values and abort on any
   mismatch.  */
int
main ()
{
  svbool_t all_true = svptrue_b32 ();
  svint32_t from_zero = svindex_s32 (0, 1);
  svint32_t from_eight = svindex_s32 (8, 1);

  /* Implicit shared. */
  implicit_shared_default (from_zero, from_eight, all_true);

  /* Explicit shared. */
  explicit_shared (from_zero, from_eight, all_true);

  /* Implicit shared with no default clause. */
  implicit_shared_no_default (from_zero, from_eight, all_true);

  /* Mix shared. */
  mix_shared (from_eight, all_true);

  /* Predetermined and static shared.  The second call relies on the static
     state left behind by the first, so the order matters.  */
  predetermined_shared_static (0);
  predetermined_shared_static (1);

  return 0;
}