tree-optimization/121395 - SLP of SIMD calls w/o LHS

The following records alternate SLP instance entries for stmts that
store to memory (have a vdef) but have no SSA def, like OMP SIMD calls
without an LHS.  There's a bit of fallout from having an SLP tree with
a NULL vectype, but nothing too gross.

	PR tree-optimization/121395
	* tree-vectorizer.h (_loop_vec_info::alternate_defs): New member.
	(LOOP_VINFO_ALTERNATE_DEFS): New.
	* tree-vect-stmts.cc (vect_stmt_relevant_p): Populate it.
	(vectorizable_simd_clone_call): Do not register a SLP def
	when there is none.
	* tree-vect-slp.cc (vect_build_slp_tree_1): Allow a NULL
	vectype when there's no LHS.  Allow all calls w/o LHS.
	(vect_analyze_slp): Process LOOP_VINFO_ALTERNATE_DEFS as
	SLP graph entries.
	(vect_make_slp_decision): Handle a NULL SLP_TREE_VECTYPE.
	(vect_slp_analyze_node_operations_1): Likewise.
	(vect_schedule_slp_node): Likewise.

	* gcc.dg/vect/pr59984.c: Adjust.

This commit is contained in:
Richard Biener
2025-08-04 14:45:53 +02:00
committed by Richard Biener
parent 9732b57443
commit 32b1be7eb4
4 changed files with 42 additions and 10 deletions

View File

@@ -64,3 +64,7 @@ main ()
return 0;
}
/* { dg-final { scan-tree-dump "31:17: optimized: loop vectorized" "vect" } } */
/* { dg-final { scan-tree-dump "37:7: optimized: loop vectorized" "vect" } } */
/* { dg-final { scan-tree-dump "44:17: optimized: loop vectorized" "vect" } } */
/* { dg-final { scan-tree-dump "50:7: optimized: loop vectorized" "vect" } } */

View File

@@ -1140,7 +1140,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
soft_fail_nunits_vectype = nunits_vectype;
}
gcc_assert (vectype);
gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
*node_vectype = vectype;
/* For every stmt in NODE find its def stmt/s. */
@@ -1187,10 +1187,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
gcall *call_stmt = dyn_cast <gcall *> (stmt);
tree lhs = gimple_get_lhs (stmt);
if (lhs == NULL_TREE
&& (!call_stmt
|| !gimple_call_internal_p (stmt)
|| !internal_store_fn_p (gimple_call_internal_fn (stmt))))
if (lhs == NULL_TREE && !call_stmt)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -4917,6 +4914,22 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
return opt_result::failure_at (vect_location,
"SLP build failed.\n");
}
stmt_vec_info stmt_info;
FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
{
vec<stmt_vec_info> stmts;
vec<stmt_vec_info> roots = vNULL;
vec<tree> remain = vNULL;
stmts.create (1);
stmts.quick_push (stmt_info);
if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
stmts, roots, remain, max_tree_size,
&limit, bst_map, NULL,
force_single_lane))
return opt_result::failure_at (vect_location,
"SLP build failed.\n");
}
}
if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
@@ -7634,7 +7647,8 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
/* If all instances ended up with vector(1) T roots make sure to
not vectorize. RVV for example relies on loop vectorization
when some instances are essentially kept scalar. See PR121048. */
if (known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
if (SLP_TREE_VECTYPE (root)
&& known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
decided_to_slp++;
}
@@ -7961,7 +7975,10 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
elements in a vector. For single-defuse-cycle, lane-reducing op, and
PHI statement that starts reduction comprised of only lane-reducing ops,
the number is more than effective vector statements actually required. */
SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node);
if (SLP_TREE_VECTYPE (node))
SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node);
else
SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
/* Handle purely internal nodes. */
if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
@@ -11318,8 +11335,10 @@ vect_schedule_slp_node (vec_info *vinfo,
stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
gcc_assert (!SLP_TREE_VECTYPE (node)
|| SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0)
SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
&& STMT_VINFO_DATA_REF (stmt_info))

View File

@@ -386,6 +386,9 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
dump_printf_loc (MSG_NOTE, vect_location,
"vec_stmt_relevant_p: stmt has vdefs.\n");
*relevant = vect_used_in_scope;
if (! STMT_VINFO_DATA_REF (stmt_info)
&& zero_ssa_operands (stmt_info->stmt, SSA_OP_DEF))
LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo).safe_push (stmt_info);
}
/* uses outside the loop. */
@@ -4752,7 +4755,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
}
}
SLP_TREE_VEC_DEFS (slp_node).quick_push (gimple_get_lhs (new_stmt));
if (gimple_get_lhs (new_stmt))
SLP_TREE_VEC_DEFS (slp_node).quick_push (gimple_get_lhs (new_stmt));
}
for (i = 0; i < nargs; ++i)

View File

@@ -947,6 +947,10 @@ public:
stmt in the chain. */
auto_vec<stmt_vec_info> reduction_chains;
/* Stmts with a virtual def but neither a data reference nor an SSA def,
such as OMP SIMD calls without an LHS.  Used as SLP graph entries.  */
auto_vec<stmt_vec_info> alternate_defs;
/* Cost vector for a single scalar iteration. */
auto_vec<stmt_info_for_cost> scalar_cost_vec;
@@ -1186,6 +1190,7 @@ public:
#define LOOP_VINFO_INNER_LOOP_COST_FACTOR(L) (L)->inner_loop_cost_factor
#define LOOP_VINFO_INV_PATTERN_DEF_SEQ(L) (L)->inv_pattern_def_seq
#define LOOP_VINFO_DRS_ADVANCED_BY(L) (L)->drs_advanced_by
#define LOOP_VINFO_ALTERNATE_DEFS(L) (L)->alternate_defs
#define LOOP_VINFO_FULLY_MASKED_P(L) \
(LOOP_VINFO_USING_PARTIAL_VECTORS_P (L) \