match.pd: Fold (x + y) >> 1 into IFN_AVG_FLOOR (x, y) for vectors
This patch folds vector expressions of the form (x + y) >> 1 into
IFN_AVG_FLOOR (x, y), reducing instruction count on platforms that
support averaging operations. For example, it can help improve the
codegen on AArch64 from:
add v0.4s, v0.4s, v31.4s
ushr v0.4s, v0.4s, 1
to:
uhadd v0.4s, v0.4s, v31.4s
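As a point of reference (not part of the commit), a minimal AArch64 intrinsics function of the shape this fold targets might look like the sketch below; the function name is made up, and the explicit masking is what lets the compiler prove that the top bit of every element is zero.

#include <arm_neon.h>

/* Hedged sketch: both operands are masked so bit 31 of each lane is clear,
   which allows (a + b) >> 1 to be rewritten as an unsigned halving add.  */
uint32x4_t
avg_floor_u32 (uint32x4_t x, uint32x4_t y)
{
  uint32x4_t a = vandq_u32 (x, vdupq_n_u32 (0x7fffffff));
  uint32x4_t b = vandq_u32 (y, vdupq_n_u32 (0x7fffffff));
  return vshrq_n_u32 (vaddq_u32 (a, b), 1);
}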
As this folding is only valid when the most significant bit of each
element in both x and y is known to be zero, this patch checks the
leading zero bits of the elements of x and y, and extends
get_nonzero_bits_1() to handle uniform vectors. When the input is a
uniform vector, the function now returns the nonzero bits of its
element.

Additionally, this patch adds checks to reject vector types in bit
constant propagation (tree-bit-ccp), since tree-bit-ccp was designed
for scalar values only, and the new vector logic in
get_nonzero_bits_1() could lead to incorrect propagation results.
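As a quick scalar sanity check (illustration only, not from the patch), the following program shows why a clear top bit is required: when the lane-wide sum cannot carry out of the element, the shifted sum equals the floor average that UHADD computes; when it can, a wrapping add loses the carry and the shift gives the wrong result.

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* Top bit clear in both operands: the 8-bit sum cannot wrap, so
     (x + y) >> 1 equals the floor average.  */
  uint8_t x = 0x7f, y = 0x7f;
  assert (((x + y) >> 1) == (x + y) / 2);

  /* Top bit set: an 8-bit wrapping add drops the carry.
     (0xff + 0x01) mod 256 = 0, so the shifted wrapped sum is 0,
     while the true floor average is 0x80.  */
  uint8_t a = 0xff, b = 0x01;
  uint8_t wrapped = (uint8_t) (a + b);
  assert ((wrapped >> 1) != (a + b) / 2);
  return 0;
}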
gcc/ChangeLog:

	* match.pd: Add folding rule for vector average.
	* tree-ssa-ccp.cc (get_default_value): Reject vector types.
	(evaluate_stmt): Reject vector types.
	* tree-ssanames.cc (get_nonzero_bits_1): Extend to handle
	uniform vectors.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/acle/uhadd_1.c: New test.
Committed by: Tamar Christina
Parent commit: 54e4f75b7d
This commit: 7bb1933c1f
gcc/match.pd (+12 lines)

@@ -2177,6 +2177,18 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
      (view_convert (rshift (view_convert:ntype @0) @1))
      (convert (rshift (convert:ntype @0) @1))))))
 
+#if GIMPLE
+/* Fold (x + y) >> 1 into IFN_AVG_FLOOR (x, y) if x and y are vectors in
+   which each element is known to have at least one leading zero bit.  */
+(simplify
+ (rshift (plus:cs @0 @1) integer_onep)
+ (if (VECTOR_TYPE_P (type)
+      && direct_internal_fn_supported_p (IFN_AVG_FLOOR, type, OPTIMIZE_FOR_BOTH)
+      && wi::clz (get_nonzero_bits (@0)) > 0
+      && wi::clz (get_nonzero_bits (@1)) > 0)
+  (IFN_AVG_FLOOR @0 @1)))
+#endif
+
 /* Try to fold (type) X op CST -> (type) (X op ((type-x) CST))
    when profitable.
    For bitwise binary operations apply operand conversions to the
gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c (new file, +34 lines)

@@ -0,0 +1,34 @@
+/* Test if SIMD fused unsigned halving adds are generated */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+#define FUSED_SIMD_UHADD(vectype, q, ts, mask) \
+  vectype simd_uhadd ## q ## _ ## ts ## _1 (vectype a) \
+  { \
+    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
+    vectype v2 = vdup ## q ## _n_ ## ts (mask); \
+    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
+  } \
+  \
+  vectype simd_uhadd ## q ## _ ## ts ## _2 (vectype a, vectype b) \
+  { \
+    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
+    vectype v2 = vand ## q ## _ ## ts (b, vdup ## q ## _n_ ## ts (mask)); \
+    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
+  }
+
+FUSED_SIMD_UHADD (uint8x8_t, , u8, 0x7f)
+FUSED_SIMD_UHADD (uint8x16_t, q, u8, 0x7f)
+FUSED_SIMD_UHADD (uint16x4_t, , u16, 0x7fff)
+FUSED_SIMD_UHADD (uint16x8_t, q, u16, 0x7fff)
+FUSED_SIMD_UHADD (uint32x2_t, , u32, 0x7fffffff)
+FUSED_SIMD_UHADD (uint32x4_t, q, u32, 0x7fffffff)
+
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.16b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.2s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4s,} 2 } } */
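For readability (expansion shown for illustration; it is implied by the macro above, not spelled out in the test), the first instantiation, FUSED_SIMD_UHADD (uint8x8_t, , u8, 0x7f), expands to roughly:

uint8x8_t simd_uhadd_u8_1 (uint8x8_t a)
{
  uint8x8_t v1 = vand_u8 (a, vdup_n_u8 (0x7f));  /* operand with top bit cleared */
  uint8x8_t v2 = vdup_n_u8 (0x7f);               /* uniform constant vector */
  return vshr_n_u8 (vadd_u8 (v1, v2), 1);        /* (v1 + v2) >> 1 -> uhadd */
}

uint8x8_t simd_uhadd_u8_2 (uint8x8_t a, uint8x8_t b)
{
  uint8x8_t v1 = vand_u8 (a, vdup_n_u8 (0x7f));
  uint8x8_t v2 = vand_u8 (b, vdup_n_u8 (0x7f));
  return vshr_n_u8 (vadd_u8 (v1, v2), 1);
}

The _1 variant adds a uniform constant vector, exercising the new uniform_vector_p path in get_nonzero_bits_1, while the _2 variant masks two variable operands so the compiler can see that their top bits are zero.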
gcc/tree-ssa-ccp.cc

@@ -298,7 +298,7 @@ get_default_value (tree var)
 {
   val.lattice_val = VARYING;
   val.mask = -1;
-  if (flag_tree_bit_ccp)
+  if (flag_tree_bit_ccp && !VECTOR_TYPE_P (TREE_TYPE (var)))
     {
       wide_int nonzero_bits = get_nonzero_bits (var);
       tree value;
@@ -2491,11 +2491,11 @@ evaluate_stmt (gimple *stmt)
       is_constant = (val.lattice_val == CONSTANT);
     }
 
+  tree lhs = gimple_get_lhs (stmt);
   if (flag_tree_bit_ccp
+      && lhs && TREE_CODE (lhs) == SSA_NAME && !VECTOR_TYPE_P (TREE_TYPE (lhs))
       && ((is_constant && TREE_CODE (val.value) == INTEGER_CST)
-	  || !is_constant)
-      && gimple_get_lhs (stmt)
-      && TREE_CODE (gimple_get_lhs (stmt)) == SSA_NAME)
+	  || !is_constant))
     {
-      tree lhs = gimple_get_lhs (stmt);
       wide_int nonzero_bits = get_nonzero_bits (lhs);
||||
@@ -508,6 +508,14 @@ get_nonzero_bits_1 (const_tree name)
|
||||
/* Use element_precision instead of TYPE_PRECISION so complex and
|
||||
vector types get a non-zero precision. */
|
||||
unsigned int precision = element_precision (TREE_TYPE (name));
|
||||
|
||||
if (VECTOR_TYPE_P (TREE_TYPE (name)))
|
||||
{
|
||||
tree elem = uniform_vector_p (name);
|
||||
if (elem)
|
||||
return get_nonzero_bits_1 (elem);
|
||||
}
|
||||
|
||||
if (TREE_CODE (name) != SSA_NAME)
|
||||
return wi::shwi (-1, precision);
|
||||
|
||||
|
||||