mirror of
https://forge.sourceware.org/marek/gcc.git
synced 2026-02-22 03:47:02 -05:00
tree-optimization/123190 - allow VF == 1 epilog vectorization
The following adjusts the condition where we reject vectorization because the scalar loop runs only for a single iteration (or two, in case we need to peel for gaps). That rejection is over-eager when considering the case of VF == 1, where the cost model should instead decide whether vectorization is worthwhile or not. I'm playing conservative here and exclude the case of two iterations as I do not have benchmark evidence. This helps fix a regression observed with improved SLP handling, not exactly for the options used in the PR though, but for a more common -O3 -march=x86-64-v3 this speeds up 433.milc by 6%. PR tree-optimization/123190 * tree-vect-loop.cc (vect_analyze_loop_costing): Allow vectorizing loops with a single scalar iteration iff the vectorization factor is 1. * gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c: New testcase. * gcc.dg/vect/slp-28.c: Avoid epilogue vectorization for simplicity.
This commit is contained in:
committed by
Richard Biener
parent
9167c9eeea
commit
96bc77e45c
@@ -0,0 +1,38 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-additional-options "-O3 -mavx2 -mno-avx512f -mtune=generic" } */
|
||||
|
||||
/* Complex number held as two separate double components: the real part
   followed by the imaginary part.  */
typedef struct {
|
||||
double real;
|
||||
double imag;
|
||||
} complex;
|
||||
|
||||
/* 3x3 matrix whose entries are `complex' values, indexed as e[row][column].  */
typedef struct { complex e[3][3]; } su3_matrix;
|
||||
|
||||
/* Compute, for every i and j in 0..2,
     c->e[i][j] = sum over k in 0..2 of a->e[i][k] * conj (b->e[j][k]),
   i.e. the product of A with the conjugate transpose of B, written with
   explicit real/imaginary arithmetic.  The k loop is fully unrolled by
   hand and each partial product goes through the temporary T before
   being accumulated.
   NOTE(review): this is a vectorizer regression testcase; the exact
   statement sequence is presumably what the dg-final scans depend on,
   so do not reorder or simplify it.  */
void mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ){
|
||||
int i,j;
|
||||
register double t,ar,ai,br,bi,cr,ci;
|
||||
for(i=0;i<3;i++)
|
||||
for(j=0;j<3;j++){
|
||||
|
||||
/* k == 0: initialize the real (cr) and imaginary (ci) accumulators.  */
ar=a->e[i][0].real; ai=a->e[i][0].imag;
|
||||
br=b->e[j][0].real; bi=b->e[j][0].imag;
|
||||
cr=ar*br; t=ai*bi; cr += t;
|
||||
ci=ai*br; t=ar*bi; ci -= t;
|
||||
|
||||
/* k == 1: accumulate the second term.  */
ar=a->e[i][1].real; ai=a->e[i][1].imag;
|
||||
br=b->e[j][1].real; bi=b->e[j][1].imag;
|
||||
t=ar*br; cr += t; t=ai*bi; cr += t;
|
||||
t=ar*bi; ci -= t; t=ai*br; ci += t;
|
||||
|
||||
/* k == 2: accumulate the third term.  */
ar=a->e[i][2].real; ai=a->e[i][2].imag;
|
||||
br=b->e[j][2].real; bi=b->e[j][2].imag;
|
||||
t=ar*br; cr += t; t=ai*bi; cr += t;
|
||||
t=ar*bi; ci -= t; t=ai*br; ci += t;
|
||||
|
||||
/* Store the finished element of the result matrix.  */
c->e[i][j].real=cr;
|
||||
c->e[i][j].imag=ci;
|
||||
}
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump "optimized: loop vectorized using 32" "vect" } } */
|
||||
/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 16 byte vectors and unroll factor 1" "vect" } } */
|
||||
@@ -1,4 +1,5 @@
|
||||
/* { dg-require-effective-target vect_int } */
|
||||
/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
|
||||
|
||||
#include <stdarg.h>
|
||||
#include "tree-vect.h"
|
||||
|
||||
@@ -1792,9 +1792,13 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo,
|
||||
}
|
||||
}
|
||||
/* Reject vectorizing for a single scalar iteration, even if
|
||||
we could in principle implement that using partial vectors. */
|
||||
we could in principle implement that using partial vectors.
|
||||
But allow such vectorization if VF == 1 in case we do not
|
||||
need to peel for gaps (if we need, avoid vectorization for
|
||||
reasons of code footprint). */
|
||||
unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
|
||||
if (scalar_niters <= peeling_gap + 1)
|
||||
if (scalar_niters <= peeling_gap + 1
|
||||
&& (assumed_vf > 1 || peeling_gap != 0))
|
||||
{
|
||||
if (dump_enabled_p ())
|
||||
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
||||
|
||||
Reference in New Issue
Block a user