Compare commits

...

5 Commits

Author SHA1 Message Date
Saurabh Jha
d54a66c1d8 Work in progress for refactoring simd intrinsic 2024-11-20 10:15:53 +00:00
Saurabh Jha
8e45a01d0f aarch64: Add support for fp8fma instructions
The AArch64 FEAT_FP8FMA extension introduces instructions for
multiply-add operations on vectors.

This patch introduces the following intrinsics:
1. {vmlalbq|vmlaltq}_f16_mf8_fpm.
2. {vmlalbq|vmlaltq}_lane{q}_f16_mf8_fpm.
3. {vmlallbbq|vmlallbtq|vmlalltbq|vmlallttq}_f32_mf8_fpm.
4. {vmlallbbq|vmlallbtq|vmlalltbq|vmlallttq}_lane{q}_f32_mf8_fpm.

It introduces the fp8fma flag.
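
As a rough usage sketch (not part of this patch; mfloat8x16_t and fpm_t
are assumed ACLE fp8 type names, not something defined on this page),
the quad-width variants are expected to be callable along these lines
when compiling with the new +fp8fma extension:

  #include <arm_neon.h>

  /* Hedged sketch: accumulate widened fp8 products into an f16 vector.
     The trailing fpm argument carries the FP8 mode bits that end up in
     the FPMR register.  */
  float16x8_t
  fma_example (float16x8_t acc, mfloat8x16_t a, mfloat8x16_t b, fpm_t fpm)
  {
    acc = vmlalbq_f16_mf8_fpm (acc, a, b, fpm);  /* "bottom" variant */
    acc = vmlaltq_f16_mf8_fpm (acc, a, b, fpm);  /* "top" variant */
    return acc;
  }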

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.cc
	(check_simd_lane_bounds): Add support for new unspecs.
	(aarch64_expand_pragma_builtins): Add support for new unspecs.
	* config/aarch64/aarch64-c.cc
	(aarch64_update_cpp_builtins): New flags.
	* config/aarch64/aarch64-option-extensions.def
	(AARCH64_OPT_EXTENSION): New flags.
	* config/aarch64/aarch64-simd-pragma-builtins.def
	(ENTRY_FMA_FPM): Macro to declare fma intrinsics.
	(REQUIRED_EXTENSIONS): Define to declare functions behind
	command line flags.
	* config/aarch64/aarch64-simd.md:
	(@aarch64_<fpm_uns_op><VQ_HSF:mode><VQ_HSF:mode><V16QI_ONLY:mode><V16QI_ONLY:mode>):
	Instruction pattern for fma intrinsics.
	(@aarch64_<fpm_uns_op><VQ_HSF:mode><VQ_HSF:mode><V16QI_ONLY:mode><VB:mode><SI_ONLY:mode>):
	Instruction pattern for fma intrinsics with lane.
	* config/aarch64/aarch64.h
	(TARGET_FP8FMA): New flag for fp8fma instructions.
	* config/aarch64/iterators.md: New attributes and iterators.
	* doc/invoke.texi: New flag for fp8fma instructions.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/fma_fpm.c: New test.
2024-11-14 09:59:41 +00:00
Saurabh Jha
ee10846d02 aarch64: Add support for fp8dot2 and fp8dot4
The AArch64 FEAT_FP8DOT2 and FEAT_FP8DOT4 extensions introduce
instructions for the dot product of vectors.

This patch introduces the following intrinsics:
1. vdot{q}_{f16|f32}_mf8_fpm.
2. vdot{q}_lane{q}_{f16|f32}_mf8_fpm.

It introduces two flags: fp8dot2 and fp8dot4.
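
As a rough usage sketch (not part of this patch; mfloat8x8_t and fpm_t
are assumed ACLE fp8 type names), the f16 variants are expected to be
callable along these lines when compiling with +fp8dot2:

  #include <arm_neon.h>

  /* Hedged sketch: 2-way fp8 dot products into an f16 accumulator.  */
  float16x4_t
  dot2_example (float16x4_t acc, mfloat8x8_t a, mfloat8x8_t b, fpm_t fpm)
  {
    acc = vdot_f16_mf8_fpm (acc, a, b, fpm);
    /* Indexed form; the indexed vector holds 8 fp8 elements read in
       pairs, so lanes 0-3 are valid here.  */
    return vdot_lane_f16_mf8_fpm (acc, a, b, 3, fpm);
  }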

We had to add space for another type in the aarch64_pragma_builtins_data
struct and updated the macros to reflect that.

We added a new aarch64_builtin_signature variant, quaternary, and added
support for it in the functions aarch64_fntype and
aarch64_expand_pragma_builtin.

We added a new namespace, function_checker, to implement range checks
for functions defined using the new pragma approach. The old intrinsic
range checks will continue to work. All new AdvSIMD intrinsics that
need lane checks should use the functions in this namespace to
implement them.
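
For example, a lane index outside the range implied by the indexed
vector is now rejected at compile time (hedged sketch, same assumed
ACLE types as above):

  /* For vdot_lane_f16_mf8_fpm the valid lanes are 0-3, so this call is
     expected to be reported as "lane out of range 0 - 3" by the new
     function_checker code instead of expanding to a bad lane.  */
  float16x4_t
  bad_lane (float16x4_t acc, mfloat8x8_t a, mfloat8x8_t b, fpm_t fpm)
  {
    return vdot_lane_f16_mf8_fpm (acc, a, b, 4, fpm);
  }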

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.cc
	(ENTRY): Change to handle extra type.
	(enum class): Added new variant.
	(struct aarch64_pragma_builtins_data): Add support for another
	type.
	(aarch64_get_number_of_args): Handle new signature.
	(require_integer_constant): New function to check whether the
	operand is an integer constant.
	(require_immediate_range): New function to validate index
	ranges.
	(check_simd_lane_bounds): New function to validate index
	operands.
	(aarch64_general_check_builtin_call): Call
	function_checker::check_simd_lane_bounds.
	(aarch64_expand_pragma_builtin): Handle new signature.
	* config/aarch64/aarch64-c.cc
	(aarch64_update_cpp_builtins): New flags.
	* config/aarch64/aarch64-option-extensions.def
	(AARCH64_OPT_EXTENSION): New flags.
	* config/aarch64/aarch64-simd-pragma-builtins.def
	(ENTRY_BINARY):	Change to handle extra type.
	(ENTRY_BINARY_FPM): Change to handle extra type.
	(ENTRY_UNARY_FPM): Change to handle extra type.
	(ENTRY_TERNARY_FPM_LANE): Macro to declare fpm ternary with
	lane intrinsics.
	(ENTRY_VDOT_FPM): Macro to declare vdot intrinsics.
	(REQUIRED_EXTENSIONS): Define to declare functions behind
	command line flags.
	* config/aarch64/aarch64-simd.md:
	(@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB:mode>):
	Instruction pattern for vdot2 intrinsics.
	(@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB2:mode><SI_ONLY:mode>):
	Instruction pattern for vdot2 intrinsics with lane.
	(@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB:mode>):
	Instruction pattern for vdot4 intrinsics.
	(@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB2:mode><SI_ONLY:mode>):
	Instruction pattern for vdot4 intrinsics with lane.
	* config/aarch64/aarch64.h
	(TARGET_FP8DOT2): New flag for fp8dot2 instructions.
	(TARGET_FP8DOT4): New flag for fp8dot4 instructions.
	* config/aarch64/iterators.md: New attributes and iterators.
	* doc/invoke.texi: New flag for fp8dot2 and fp8dot4
	instructions.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/vdot2_fpmdot.c: New test.
	* gcc.target/aarch64/simd/vdot4_fpmdot.c: New test.
2024-11-14 09:59:41 +00:00
Saurabh Jha
3103441079 aarch64: Add support for fp8 convert and scale
The AArch64 FEAT_FP8 extension introduces instructions for conversion
and scaling.

This patch introduces the following intrinsics:
1. vcvt{1|2}_{bf16|high_bf16|low_bf16}_mf8_fpm.
2. vcvt{q}_mf8_f16_fpm.
3. vcvt_{high}_mf8_f32_fpm.
4. vscale{q}_{f16|f32|f64}.

We introduced two aarch64_builtin_signatures enum variants, unary and
ternary, and added support for these variants in the functions
aarch64_fntype and aarch64_expand_pragma_builtin.

We added new simd_types for integers (s32, s32q, and s64q) and for
floating points (f8 and f8q).

Because we added support for fp8 intrinsics here, we modified the check
in acle/fp8.c that was verifying that the __ARM_FEATURE_FP8 macro is not
defined.
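
As a rough usage sketch (not part of this patch; mfloat8x8_t and fpm_t
are assumed ACLE fp8 type names), the conversion and scaling intrinsics
are expected to be callable along these lines when compiling with +fp8:

  #include <arm_neon.h>

  /* Hedged sketch: narrow two f16 vectors into one fp8 vector, then
     widen it back to f16 with the "1" conversion.  The fpm argument
     selects the fp8 format via FPMR.  */
  float16x8_t
  convert_example (float16x4_t lo, float16x4_t hi, fpm_t fpm)
  {
    mfloat8x8_t packed = vcvt_mf8_f16_fpm (lo, hi, fpm);
    return vcvt1_f16_mf8_fpm (packed, fpm);
  }

  /* vscale does not use FPMR; it scales each element by 2^exponent.  */
  float32x4_t
  scale_example (float32x4_t x, int32x4_t exponents)
  {
    return vscaleq_f32 (x, exponents);
  }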

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.cc
	(ENTRY): Modified to support uses_fpmr flag.
	(enum class): New variants to support new signatures.
	(struct aarch64_pragma_builtins_data): Add a new boolean field,
	uses_fpmr.
	(aarch64_get_number_of_args): Helper function used in
	aarch64_fntype and aarch64_expand_pragma_builtin.
	(aarch64_fntype): Handle new signatures.
	(aarch64_expand_pragma_builtin): Handle new signatures.
	* config/aarch64/aarch64-c.cc
	(aarch64_update_cpp_builtins): New flag for FP8.
	* config/aarch64/aarch64-simd-pragma-builtins.def
	(ENTRY_BINARY): Macro to declare binary intrinsics.
	(ENTRY_TERNARY): Macro to declare ternary intrinsics.
	(ENTRY_UNARY): Macro to declare unary intrinsics.
	(ENTRY_VHSDF): Macro to declare binary intrinsics.
	(ENTRY_VHSDF_VHSDI): Macro to declare binary intrinsics.
	(REQUIRED_EXTENSIONS): Define to declare functions behind
	command line flags.
	* config/aarch64/aarch64-simd.md
	(@aarch64_<fpm_unary_bf_uns_op><V8BF_ONLY:mode><VB:mode>): Unary
	pattern.
	(@aarch64_<fpm_unary_hf_uns_op><V8HF_ONLY:mode><VB:mode>): Unary
	pattern.
	(@aarch64_lower_<fpm_unary_bf_uns_op><V8BF_ONLY:mode><V16QI_ONLY:mode>):
	Unary pattern.
	(@aarch64_lower_<fpm_unary_hf_uns_op><V8HF_ONLY:mode><V16QI_ONLY:mode>):
	Unary pattern.
	(@aarch64<fpm_uns_op><VB:mode><VCVTFPM:mode><VH_SF:mode>):
	Binary pattern.
	(@aarch64_<fpm_uns_op><V16QI_ONLY:mode><V8QI_ONLY:mode><V4SF_ONLY:mode><V4SF_ONLY:mode>):
	Unary pattern.
	(@aarch64_<fpm_uns_op><VHSDF:mode><VHSDI:mode>): Binary pattern.
	* config/aarch64/iterators.md: New attributes and iterators.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/acle/fp8.c: Remove check that fp8 feature
	macro doesn't exist.
	* gcc.target/aarch64/simd/scale_fpm.c: New test.
	* gcc.target/aarch64/simd/vcvt_fpm.c: New test.
2024-11-14 09:59:31 +00:00
Vladimir Miloserdov
1b6b028e27 aarch64: Refactor infrastructure for advsimd intrinsics
This patch refactors the infrastructure for defining advsimd pragma
intrinsics, adding support for more flexible type and signature
handling in future SIMD extensions.

A new simd_type structure is introduced, which allows for consistent
mode and qualifier management across various advsimd operations.
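
As a rough standalone model of the idea (plain C, not the GCC code
itself, which uses machine_mode and aarch64_type_qualifiers), each
ENTRY line in the .def file names one simd_type per operand, and
expanding the macro produces a table that carries both the mode and
the qualifiers for every intrinsic:

  #include <stdio.h>

  /* Hypothetical stand-ins for the GCC-internal types.  */
  struct simd_type { const char *mode; const char *qualifiers; };
  struct builtin   { const char *name; struct simd_type ret, arg0, arg1; };

  #define F16Q { "V8HFmode", "none" }
  #define F32Q { "V4SFmode", "none" }
  #define ENTRY(N, T0, T1, T2) { #N, T0, T1, T2 },

  static const struct builtin builtins[] = {
    ENTRY (vamaxq_f16, F16Q, F16Q, F16Q)
    ENTRY (vamaxq_f32, F32Q, F32Q, F32Q)
  };

  int
  main (void)
  {
    for (unsigned int i = 0; i < sizeof builtins / sizeof builtins[0]; i++)
      printf ("%s returns %s\n", builtins[i].name, builtins[i].ret.mode);
    return 0;
  }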

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.cc (ENTRY): Modify to
	include modes and qualifiers for simd_type structure.
	(ENTRY_VHSDF): Move to aarch64-simd-pragma-builtins.def to
	decouple.
	(struct simd_type): New structure for managing mode and
	qualifier combinations for SIMD types.
	(struct aarch64_pragma_builtins_data): Replace mode with
	simd_type to support multiple argument types for intrinsics.
	(aarch64_fntype): Modify to handle different type shapes.
	(aarch64_expand_pragma_builtin): Modify to handle different
	type shapes.

	* config/aarch64/aarch64-simd-pragma-builtins.def (ENTRY_BINARY):
	Move from aarch64-builtins.cc.
	(ENTRY_VHSDF): Move from aarch64-builtins.cc.
	(REQUIRED_EXTENSIONS): New macro.
2024-11-14 06:34:13 +00:00
15 changed files with 1644 additions and 835 deletions

gcc/config/aarch64/aarch64-builtins.cc

@@ -696,6 +696,7 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
VREINTERPRET_BUILTINS \
VREINTERPRETQ_BUILTINS
/* TODO: Add fp8 here and in the corresponding "high" list.  */
#define AARCH64_SIMD_VGET_LOW_BUILTINS \
VGET_LOW_BUILTIN(f16) \
VGET_LOW_BUILTIN(f32) \
@@ -780,7 +781,7 @@ typedef struct
AARCH64_SIMD_BUILTIN_##T##_##N##A,
#undef ENTRY
#define ENTRY(N, S, M, U) \
#define ENTRY(N, S, M0, M1, M2, M3, M4, USES_FPMR, U) \
AARCH64_##N,
enum aarch64_builtins
@@ -1590,13 +1591,118 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma)
enum class aarch64_builtin_signatures
{
unary,
binary,
ternary,
quaternary,
};
namespace {
struct simd_type {
machine_mode mode;
aarch64_type_qualifiers qualifiers;
};
namespace simd_types {
constexpr simd_type f8 { V8QImode, qualifier_modal_float };
constexpr simd_type f8q { V16QImode, qualifier_modal_float };
constexpr simd_type s8_scalar_const_ptr
{ QImode, qualifier_const_pointer_map_mode };
constexpr simd_type s8_scalar { QImode, qualifier_none };
constexpr simd_type s8 { V8QImode, qualifier_none };
constexpr simd_type s8q { V16QImode, qualifier_none };
constexpr simd_type u8_scalar_const_ptr
{ QImode, qualifier_const_pointer_map_mode };
constexpr simd_type u8_scalar { QImode, qualifier_unsigned };
constexpr simd_type u8 { V8QImode, qualifier_unsigned };
constexpr simd_type u8q { V16QImode, qualifier_unsigned };
constexpr simd_type s16_scalar_const_ptr
{ HImode, qualifier_const_pointer_map_mode };
constexpr simd_type s16_scalar { HImode, qualifier_none };
constexpr simd_type s16 { V4HImode, qualifier_none };
constexpr simd_type u16_scalar_const_ptr
{ HImode, qualifier_const_pointer_map_mode };
constexpr simd_type u16_scalar { HImode, qualifier_unsigned };
constexpr simd_type u16 { V4HImode, qualifier_unsigned };
constexpr simd_type s16q { V8HImode, qualifier_none };
constexpr simd_type u16q { V8HImode, qualifier_unsigned };
constexpr simd_type s32_scalar_const_ptr
{ SImode, qualifier_const_pointer_map_mode };
constexpr simd_type s32_index { SImode, qualifier_lane_index };
constexpr simd_type s32_scalar { SImode, qualifier_none };
constexpr simd_type s32 { V2SImode, qualifier_none };
constexpr simd_type u32_scalar_const_ptr
{ SImode, qualifier_const_pointer_map_mode };
constexpr simd_type u32_scalar { SImode, qualifier_unsigned };
constexpr simd_type u32 { V2SImode, qualifier_unsigned };
constexpr simd_type s32q { V4SImode, qualifier_none };
constexpr simd_type u32q { V4SImode, qualifier_unsigned };
constexpr simd_type s64_scalar_const_ptr
{ DImode, qualifier_const_pointer_map_mode };
constexpr simd_type s64_scalar { DImode, qualifier_none };
constexpr simd_type s64 { V1DImode, qualifier_none };
constexpr simd_type u64_scalar_const_ptr
{ DImode, qualifier_const_pointer_map_mode };
constexpr simd_type u64_scalar { DImode, qualifier_unsigned };
constexpr simd_type u64 { V1DImode, qualifier_unsigned };
constexpr simd_type s64q { V2DImode, qualifier_none };
constexpr simd_type u64q { V2DImode, qualifier_unsigned };
constexpr simd_type p8_scalar_const_ptr
{ QImode, qualifier_const_pointer_map_mode };
constexpr simd_type p8_scalar { QImode, qualifier_poly };
constexpr simd_type p8 { V8QImode, qualifier_poly };
constexpr simd_type p8q { V16QImode, qualifier_poly };
constexpr simd_type p16_scalar_const_ptr
{ HImode, qualifier_const_pointer_map_mode };
constexpr simd_type p16_scalar { HImode, qualifier_poly };
constexpr simd_type p16 { V4HImode, qualifier_poly };
constexpr simd_type p16q { V8HImode, qualifier_poly };
constexpr simd_type p64_scalar_const_ptr
{ DImode, qualifier_const_pointer_map_mode };
constexpr simd_type p64_scalar { DImode, qualifier_poly };
constexpr simd_type p64 { V1DImode, qualifier_poly };
constexpr simd_type p64q { V2DImode, qualifier_poly };
constexpr simd_type f16_scalar_const_ptr
{ HFmode, qualifier_const_pointer_map_mode };
constexpr simd_type f16_scalar { HFmode, qualifier_none };
constexpr simd_type f16 { V4HFmode, qualifier_none };
constexpr simd_type f16q { V8HFmode, qualifier_none };
constexpr simd_type f32_scalar_const_ptr
{ SFmode, qualifier_const_pointer_map_mode };
constexpr simd_type f32_scalar { SFmode, qualifier_none };
constexpr simd_type f32 { V2SFmode, qualifier_none };
constexpr simd_type f32q { V4SFmode, qualifier_none };
constexpr simd_type f64_scalar_const_ptr
{ DFmode, qualifier_const_pointer_map_mode };
constexpr simd_type f64_scalar { DFmode, qualifier_none };
constexpr simd_type f64 { V1DFmode, qualifier_none };
constexpr simd_type f64q { V2DFmode, qualifier_none };
constexpr simd_type bf16 { V4BFmode, qualifier_none };
constexpr simd_type bf16q { V8BFmode, qualifier_none };
constexpr simd_type none { VOIDmode, qualifier_none };
}
}
#undef ENTRY
#define ENTRY(N, S, M, U) \
{#N, aarch64_builtin_signatures::S, E_##M##mode, U, \
aarch64_required_extensions::REQUIRED_EXTENSIONS},
#define ENTRY(N, S, T0, T1, T2, T3, T4, USES_FPMR, U) \
{#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \
simd_types::T2, simd_types::T3, simd_types::T4, U, \
USES_FPMR, aarch64_required_extensions::REQUIRED_EXTENSIONS},
/* Initialize pragma builtins. */
@@ -1604,8 +1710,9 @@ struct aarch64_pragma_builtins_data
{
const char *name;
aarch64_builtin_signatures signature;
machine_mode mode;
simd_type types[5];
int unspec;
bool uses_fpmr;
aarch64_required_extensions required_extensions;
};
@@ -1613,17 +1720,42 @@ static aarch64_pragma_builtins_data aarch64_pragma_builtins[] = {
#include "aarch64-simd-pragma-builtins.def"
};
static unsigned int
aarch64_get_number_of_args (const aarch64_pragma_builtins_data &builtin_data)
{
if (builtin_data.signature == aarch64_builtin_signatures::unary)
return 1;
else if (builtin_data.signature == aarch64_builtin_signatures::binary)
return 2;
else if (builtin_data.signature == aarch64_builtin_signatures::ternary)
return 3;
else if (builtin_data.signature == aarch64_builtin_signatures::quaternary)
return 4;
else
// No other signature supported.
gcc_unreachable ();
}
static tree
aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
{
auto type = aarch64_simd_builtin_type (builtin_data.mode, qualifier_none);
switch (builtin_data.signature)
tree return_type
= aarch64_simd_builtin_type (builtin_data.types[0].mode,
builtin_data.types[0].qualifiers);
vec<tree, va_gc> *arg_types = NULL;
auto nargs = aarch64_get_number_of_args (builtin_data);
for (unsigned int i = 1; i <= nargs; ++i)
{
case aarch64_builtin_signatures::binary:
return build_function_type_list (type, type, type, NULL_TREE);
default:
gcc_unreachable ();
auto type = aarch64_simd_builtin_type (builtin_data.types[i].mode,
builtin_data.types[i].qualifiers);
vec_safe_push (arg_types, type);
}
if (builtin_data.uses_fpmr == true)
vec_safe_push (arg_types, uint64_type_node);
return build_function_type_vec (return_type, arg_types);
}
static void
@@ -2431,6 +2563,88 @@ aarch64_general_required_extensions (unsigned int code)
return ext::streaming_compatible (0);
}
namespace function_checker {
void
require_integer_constant (location_t location, tree arg)
{
if (TREE_CODE (arg) != INTEGER_CST)
{
error_at (location, "Constant-type integer argument expected");
return;
}
}
void
require_immediate_range (location_t location, tree arg, HOST_WIDE_INT min,
HOST_WIDE_INT max)
{
if (wi::to_widest (arg) < min || wi::to_widest (arg) > max)
{
error_at (location, "lane out of range %wd - %wd", min, max);
return;
}
}
/* Validates indexing into a vector using the index's size and the instruction,
where instruction is represented by the unspec.
This only works for intrinsics declared using pragmas in
aarch64-simd-pragma-builtins.def. */
void
check_simd_lane_bounds (location_t location, const aarch64_pragma_builtins_data
*builtin_data, tree *args)
{
if (builtin_data == NULL)
// Don't check for functions that are not declared in
// aarch64-simd-pragma-builtins.def.
return;
auto nargs = aarch64_get_number_of_args (*builtin_data);
switch (builtin_data->unspec)
{
case UNSPEC_VDOT2:
case UNSPEC_VDOT4:
{
if (builtin_data->types[nargs].qualifiers != qualifier_lane_index)
break;
auto index_arg = args[nargs - 1];
require_integer_constant (location, index_arg);
auto vector_to_index_mode = builtin_data->types[nargs - 1].mode;
int vector_to_index_mode_size
= GET_MODE_NUNITS (vector_to_index_mode).to_constant ();
auto low = 0;
int high;
switch (builtin_data->unspec)
{
case UNSPEC_VDOT2:
high = vector_to_index_mode_size / 2 - 1;
break;
case UNSPEC_VDOT4:
high = vector_to_index_mode_size / 4 - 1;
break;
case UNSPEC_FMLALB:
case UNSPEC_FMLALT:
case UNSPEC_FMLALLBB:
case UNSPEC_FMLALLBT:
case UNSPEC_FMLALLTB:
case UNSPEC_FMLALLTT:
high = vector_to_index_mode_size - 1;
break;
default:
gcc_unreachable ();
}
require_immediate_range (location, index_arg, low, high);
break;
}
}
}
};
bool
aarch64_general_check_builtin_call (location_t location, vec<location_t>,
unsigned int code, tree fndecl,
@@ -2442,6 +2656,9 @@ aarch64_general_check_builtin_call (location_t location, vec<location_t>,
if (!aarch64_check_required_extensions (location, decl, required_extensions))
return false;
auto builtin_data = aarch64_get_pragma_builtin (code);
function_checker::check_simd_lane_bounds (location, builtin_data, args);
switch (code)
{
case AARCH64_RSR:
@@ -3336,17 +3553,184 @@ static rtx
aarch64_expand_pragma_builtin (tree exp, rtx target,
const aarch64_pragma_builtins_data *builtin_data)
{
expand_operand ops[3];
auto mode = builtin_data->mode;
auto op1 = expand_normal (CALL_EXPR_ARG (exp, 0));
auto op2 = expand_normal (CALL_EXPR_ARG (exp, 1));
create_output_operand (&ops[0], target, mode);
create_input_operand (&ops[1], op1, mode);
create_input_operand (&ops[2], op2, mode);
auto nargs = aarch64_get_number_of_args (*builtin_data);
auto unspec = builtin_data->unspec;
auto icode = code_for_aarch64 (unspec, mode);
expand_insn (icode, 3, ops);
expand_operand ops[5];
create_output_operand (&ops[0], target, builtin_data->types[0].mode);
for (unsigned int i = 1; i <= nargs; ++i)
create_input_operand (&ops[i],
expand_normal (CALL_EXPR_ARG (exp, i - 1)),
builtin_data->types[i].mode);
if (builtin_data->uses_fpmr == true)
{
auto fpm_input = expand_normal (CALL_EXPR_ARG (exp, nargs));
auto fpmr = gen_rtx_REG (DImode, FPM_REGNUM);
emit_move_insn (fpmr, fpm_input);
}
enum insn_code icode;
switch (builtin_data->unspec)
{
case UNSPEC_FAMAX:
case UNSPEC_FAMIN:
icode = code_for_aarch64 (builtin_data->unspec,
builtin_data->types[0].mode);
expand_insn (icode, nargs + 1, ops);
break;
case UNSPEC_VCVT1:
case UNSPEC_VCVT1_HIGH:
case UNSPEC_VCVT2:
case UNSPEC_VCVT2_HIGH:
icode = code_for_aarch64 (builtin_data->unspec,
builtin_data->types[0].mode,
builtin_data->types[1].mode);
expand_insn (icode, nargs + 1, ops);
break;
case UNSPEC_VCVT1_LOW:
case UNSPEC_VCVT2_LOW:
icode = code_for_aarch64_lower (builtin_data->unspec,
builtin_data->types[0].mode,
builtin_data->types[1].mode);
expand_insn (icode, nargs + 1, ops);
break;
case UNSPEC_FSCALE:
icode = code_for_aarch64 (builtin_data->unspec,
builtin_data->types[1].mode,
builtin_data->types[2].mode);
expand_insn (icode, nargs + 1, ops);
break;
case UNSPEC_VCVT:
icode = code_for_aarch64 (builtin_data->unspec,
builtin_data->types[0].mode,
builtin_data->types[1].mode,
builtin_data->types[2].mode);
expand_insn (icode, nargs + 1, ops);
break;
case UNSPEC_VCVT_HIGH:
icode = code_for_aarch64 (builtin_data->unspec,
builtin_data->types[0].mode,
builtin_data->types[1].mode,
builtin_data->types[2].mode,
builtin_data->types[3].mode);
expand_insn (icode, nargs + 1, ops);
break;
case UNSPEC_VDOT2:
case UNSPEC_VDOT4:
case UNSPEC_FMLALB:
case UNSPEC_FMLALT:
case UNSPEC_FMLALLBB:
case UNSPEC_FMLALLBT:
case UNSPEC_FMLALLTB:
case UNSPEC_FMLALLTT:
if (builtin_data->signature == aarch64_builtin_signatures::ternary)
icode = code_for_aarch64 (builtin_data->unspec,
builtin_data->types[0].mode,
builtin_data->types[1].mode,
builtin_data->types[2].mode,
builtin_data->types[3].mode);
else if
(builtin_data->signature == aarch64_builtin_signatures::quaternary)
icode = code_for_aarch64 (builtin_data->unspec,
builtin_data->types[0].mode,
builtin_data->types[1].mode,
builtin_data->types[2].mode,
builtin_data->types[3].mode,
builtin_data->types[4].mode);
else
gcc_unreachable ();
expand_insn (icode, nargs + 1, ops);
target = ops[0].value;
break;
case UNSPEC_VCREATE:
target = force_lowpart_subreg (builtin_data->types[0].mode,
expand_normal (CALL_EXPR_ARG (exp, 0)),
DImode);
break;
case UNSPEC_VEC_COPY:
{
/* Need to do lane checks here. */
/* Also need to set indexes correctly here. */
expand_operand vget_ops[3];
rtx vget_target;
auto vget_output_mode = GET_MODE_INNER (builtin_data->types[0].mode);
create_output_operand (&vget_ops[0], vget_target, vget_output_mode);
vget_ops[1] = ops[3];
vget_ops[2] = ops[4];
auto vget_icode = code_for_aarch64_get_lane (builtin_data->types[0].mode);
expand_insn (vget_icode, 3, vget_ops);
vget_target = vget_ops[0].value;
expand_operand vset_ops[4];
create_output_operand (&vset_ops[0],
target,
builtin_data->types[0].mode);
vset_ops[1] = vget_ops[0];
vset_ops[2] = ops[2];
vset_ops[3] = ops[1];
auto vset_icode = code_for_aarch64_simd_vec_set (builtin_data->types[0].mode);
expand_insn (vset_icode, 4, vset_ops);
target = vset_ops[0].value;
break;
}
case UNSPEC_DUP:
target = expand_vector_broadcast (builtin_data->types[0].mode,
expand_normal (CALL_EXPR_ARG (exp, 0)));
break;
case UNSPEC_DUPB:
icode = code_for_aarch64_get_lane (builtin_data->types[1].mode);
expand_insn (icode, nargs + 1, ops);
target = ops[0].value;
break;
case UNSPEC_LD1:
{
if (builtin_data->types[0].mode == V1DFmode)
target = expand_vector_broadcast (builtin_data->types[0].mode,
expand_normal (CALL_EXPR_ARG (exp, 0)));
else
{
icode = code_for_aarch64_ld1 (builtin_data->types[0].mode);
auto input
= convert_memory_address (Pmode,
expand_normal (CALL_EXPR_ARG (exp, 0)));
create_input_operand (&ops[1], input, Pmode);
expand_insn (icode, nargs + 1, ops);
}
target = ops[0].value;
break;
}
case UNSPEC_DUP_LANE:
{
/* We need to do lane checks here. */
auto lane = INTVAL (expand_normal (CALL_EXPR_ARG (exp, 1)));
auto vector_mode = builtin_data->types[1].mode;
auto nunits = GET_MODE_NUNITS (vector_mode).to_constant ();
create_input_operand(&ops[2],
gen_int_mode ((ENDIAN_LANE_N (nunits, lane)),
SImode),
SImode);
icode = code_for_aarch64_dup_lane (builtin_data->types[0].mode);
expand_insn (icode, nargs + 1, ops);
target = ops[0].value;
break;
}
default:
gcc_unreachable ();
}
return target;
}
@@ -4186,7 +4570,6 @@ aarch64_resolve_overloaded_builtin_general (location_t loc, tree function,
#undef CF3
#undef CF4
#undef CF10
#undef ENTRY_VHSDF
#undef VAR1
#undef VAR2
#undef VAR3

gcc/config/aarch64/aarch64-c.cc

@@ -258,6 +258,14 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
aarch64_def_or_undef (TARGET_SVE_BF16,
"__ARM_FEATURE_SVE_BF16", pfile);
aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile);
aarch64_def_or_undef (TARGET_FP8DOT2, "__ARM_FEATURE_FP8DOT2", pfile);
aarch64_def_or_undef (TARGET_FP8DOT4, "__ARM_FEATURE_FP8DOT4", pfile);
aarch64_def_or_undef (TARGET_FP8FMA, "__ARM_FEATURE_FP8FMA", pfile);
aarch64_def_or_undef (TARGET_LS64,
"__ARM_FEATURE_LS64", pfile);
aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile);

gcc/config/aarch64/aarch64-option-extensions.def

@@ -236,6 +236,12 @@ AARCH64_OPT_EXTENSION("gcs", GCS, (), (), (), "gcs")
AARCH64_OPT_EXTENSION("fp8", FP8, (SIMD), (), (), "fp8")
AARCH64_OPT_EXTENSION("fp8dot2", FP8DOT2, (SIMD), (), (), "fp8dot2")
AARCH64_OPT_EXTENSION("fp8dot4", FP8DOT4, (SIMD), (), (), "fp8dot4")
AARCH64_OPT_EXTENSION("fp8fma", FP8FMA, (SIMD), (), (), "fp8fma")
AARCH64_OPT_EXTENSION("faminmax", FAMINMAX, (SIMD), (), (), "faminmax")
#undef AARCH64_OPT_FMV_EXTENSION

gcc/config/aarch64/aarch64-simd-pragma-builtins.def

@@ -18,16 +18,263 @@
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#undef ENTRY_BINARY
#define ENTRY_BINARY(N, T0, T1, T2, U) \
ENTRY (N, binary, T0, T1, T2, none, none, false, U)
#undef ENTRY_BINARY_FPM
#define ENTRY_BINARY_FPM(N, T0, T1, T2, U) \
ENTRY (N, binary, T0, T1, T2, none, none, true, U)
#undef ENTRY_BINARY_TWO_LANES
#define ENTRY_BINARY_TWO_LANES(N, T0, T1, T2, U) \
ENTRY (N, quaternary, T0, T1, s32_index, T2, s32_index, false, U)
#undef ENTRY_TERNARY_FPM
#define ENTRY_TERNARY_FPM(N, T0, T1, T2, T3, U) \
ENTRY (N, ternary, T0, T1, T2, T3, none, true, U)
#undef ENTRY_TERNARY_FPM_LANE
#define ENTRY_TERNARY_FPM_LANE(N, T0, T1, T2, T3, U) \
ENTRY (N, quaternary, T0, T1, T2, T3, s32_index, true, U)
#undef ENTRY_UNARY
#define ENTRY_UNARY(N, T0, T1, U) \
ENTRY (N, unary, T0, T1, none, none, none, false, U)
#undef ENTRY_UNARY_LANE
#define ENTRY_UNARY_LANE(N, T0, T1, U) \
ENTRY_BINARY (N, T0, T1, s32_index, U) \
#undef ENTRY_UNARY_FPM
#define ENTRY_UNARY_FPM(N, T0, T1, U) \
ENTRY (N, unary, T0, T1, none, none, none, true, U)
#undef ENTRY_VDOT_FPM
#define ENTRY_VDOT_FPM(T, U) \
ENTRY_TERNARY_FPM (vdot_##T##_mf8_fpm, T, T, f8, f8, U) \
ENTRY_TERNARY_FPM (vdotq_##T##_mf8_fpm, T##q, T##q, f8q, f8q, U) \
ENTRY_TERNARY_FPM_LANE (vdot_lane_##T##_mf8_fpm, T, T, f8, f8, U) \
ENTRY_TERNARY_FPM_LANE (vdot_laneq_##T##_mf8_fpm, T, T, f8, f8q, U) \
ENTRY_TERNARY_FPM_LANE (vdotq_lane_##T##_mf8_fpm, T##q, T##q, f8q, f8, U) \
ENTRY_TERNARY_FPM_LANE (vdotq_laneq_##T##_mf8_fpm, T##q, T##q, f8q, f8q, U)
#undef ENTRY_FMA_FPM
#define ENTRY_FMA_FPM(N, T, U) \
ENTRY_TERNARY_FPM (N##_##T##_mf8_fpm, T##q, T##q, f8q, f8q, U) \
ENTRY_TERNARY_FPM_LANE (N##_lane_##T##_mf8_fpm, T##q, T##q, f8q, f8, U) \
ENTRY_TERNARY_FPM_LANE (N##_laneq_##T##_mf8_fpm, T##q, T##q, f8q, f8q, U)
#undef ENTRY_VHSDF
#define ENTRY_VHSDF(NAME, SIGNATURE, UNSPEC) \
ENTRY (NAME##_f16, SIGNATURE, V4HF, UNSPEC) \
ENTRY (NAME##q_f16, SIGNATURE, V8HF, UNSPEC) \
ENTRY (NAME##_f32, SIGNATURE, V2SF, UNSPEC) \
ENTRY (NAME##q_f32, SIGNATURE, V4SF, UNSPEC) \
ENTRY (NAME##q_f64, SIGNATURE, V2DF, UNSPEC)
#define ENTRY_VHSDF(NAME, UNSPEC) \
ENTRY_BINARY (NAME##_f16, f16, f16, f16, UNSPEC) \
ENTRY_BINARY (NAME##q_f16, f16q, f16q, f16q, UNSPEC) \
ENTRY_BINARY (NAME##_f32, f32, f32, f32, UNSPEC) \
ENTRY_BINARY (NAME##q_f32, f32q, f32q, f32q, UNSPEC) \
ENTRY_BINARY (NAME##q_f64, f64q, f64q, f64q, UNSPEC)
#undef ENTRY_VHSDF_VHSDI
#define ENTRY_VHSDF_VHSDI(NAME, UNSPEC) \
ENTRY_BINARY (NAME##_f16, f16, f16, s16, UNSPEC) \
ENTRY_BINARY (NAME##q_f16, f16q, f16q, s16q, UNSPEC) \
ENTRY_BINARY (NAME##_f32, f32, f32, s32, UNSPEC) \
ENTRY_BINARY (NAME##q_f32, f32q, f32q, s32q, UNSPEC) \
ENTRY_BINARY (NAME##q_f64, f64q, f64q, s64q, UNSPEC)
#undef ENTRY_UNARY_N_VALL_F16_SCALAR
#define ENTRY_UNARY_N_VALL_F16_SCALAR(NAME, UNSPEC) \
ENTRY_UNARY (NAME##_n_p8, p8, p8_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_p8, p8q, p8_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_p16, p16, p16_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_p16, p16q, p16_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_p64, p64, p64_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_p64, p64q, p64_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_s8, s8, s8_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_s8, s8q, s8_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_s16, s16, s16_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_s16, s16q, s16_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_s32, s32, s32_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_s32, s32q, s32_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_s64, s64, s64_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_s64, s64q, s64_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_u8, u8, u8_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_u8, u8q, u8_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_u16, u16, u16_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_u16, u16q, u16_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_u32, u32, u32_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_u32, u32q, u32_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_u64, u64, u64_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_u64, u64q, u64_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_f16, f16, f16_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_f16, f16q, f16_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_f32, f32, f32_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_f32, f32q, f32_scalar, UNSPEC) \
ENTRY_UNARY (NAME##_n_f64, f64, f64_scalar, UNSPEC) \
ENTRY_UNARY (NAME##q_n_f64, f64q, f64_scalar, UNSPEC) \
#undef ENTRY_UNARY_VALL_F16_CONST_PTR
#define ENTRY_UNARY_VALL_F16_CONST_PTR(NAME, UNSPEC) \
ENTRY_UNARY (NAME##_p8, p8, p8_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_p8, p8q, p8_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_p16, p16, p16_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_p16, p16q, p16_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_p64, p64, p64_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_p64, p64q, p64_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_s8, s8, s8_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_s8, s8q, s8_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_s16, s16, s16_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_s16, s16q, s16_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_s32, s32, s32_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_s32, s32q, s32_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_s64, s64, s64_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_s64, s64q, s64_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_u8, u8, u8_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_u8, u8q, u8_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_u16, u16, u16_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_u16, u16q, u16_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_u32, u32, u32_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_u32, u32q, u32_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_u64, u64, u64_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_u64, u64q, u64_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_f16, f16, f16_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_f16, f16q, f16_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_f32, f32, f32_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_f32, f32q, f32_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##_f64, f64, f64_scalar_const_ptr, UNSPEC) \
ENTRY_UNARY (NAME##q_f64, f64q, f64_scalar_const_ptr, UNSPEC) \
#undef ENTRY_UNARY_LANE_VALL_F16
#define ENTRY_UNARY_LANE_VALL_F16(NAME, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_p8, p8, p8, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_p8, p8, p8q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_p16, p16, p16, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_p16, p16, p16q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_p64, p64, p64, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_p64, p64, p64q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_s8, s8, s8, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_s8, s8, s8q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_s16, s16, s16, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_s16, s16, s16q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_s32, s32, s32, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_s32, s32, s32q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_s64, s64, s64, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_s64, s64, s64q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_u8, u8, u8, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_u8, u8, u8q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_u16, u16, u16, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_u16, u16, u16q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_u32, u32, u32, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_u32, u32, u32q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_u64, u64, u64, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_u64, u64, u64q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_f16, f16, f16, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_f16, f16, f16q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_f32, f32, f32, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_f32, f32, f32q, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_lane_f64, f64, f64, UNSPEC) \
ENTRY_UNARY_LANE (NAME##_laneq_f64, f64, f64q, UNSPEC) \
// faminmax
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX)
ENTRY_VHSDF (vamax, binary, UNSPEC_FAMAX)
ENTRY_VHSDF (vamin, binary, UNSPEC_FAMIN)
ENTRY_VHSDF (vamax, UNSPEC_FAMAX)
ENTRY_VHSDF (vamin, UNSPEC_FAMIN)
#undef REQUIRED_EXTENSIONS
// fpm conversion
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
ENTRY_UNARY_FPM (vcvt1_bf16_mf8_fpm, bf16q, f8, UNSPEC_VCVT1)
ENTRY_UNARY_FPM (vcvt1_high_bf16_mf8_fpm, bf16q, f8q, UNSPEC_VCVT1_HIGH)
ENTRY_UNARY_FPM (vcvt1_low_bf16_mf8_fpm, bf16q, f8q, UNSPEC_VCVT1_LOW)
ENTRY_UNARY_FPM (vcvt1_f16_mf8_fpm, f16q, f8, UNSPEC_VCVT1)
ENTRY_UNARY_FPM (vcvt1_high_f16_mf8_fpm, f16q, f8q, UNSPEC_VCVT1_HIGH)
ENTRY_UNARY_FPM (vcvt1_low_f16_mf8_fpm, f16q, f8q, UNSPEC_VCVT1_LOW)
ENTRY_UNARY_FPM (vcvt2_bf16_mf8_fpm, bf16q, f8, UNSPEC_VCVT2)
ENTRY_UNARY_FPM (vcvt2_high_bf16_mf8_fpm, bf16q, f8q, UNSPEC_VCVT2_HIGH)
ENTRY_UNARY_FPM (vcvt2_low_bf16_mf8_fpm, bf16q, f8q, UNSPEC_VCVT2_LOW)
ENTRY_UNARY_FPM (vcvt2_f16_mf8_fpm, f16q, f8, UNSPEC_VCVT2)
ENTRY_UNARY_FPM (vcvt2_high_f16_mf8_fpm, f16q, f8q, UNSPEC_VCVT2_HIGH)
ENTRY_UNARY_FPM (vcvt2_low_f16_mf8_fpm, f16q, f8q, UNSPEC_VCVT2_LOW)
ENTRY_BINARY_FPM (vcvt_mf8_f16_fpm, f8, f16, f16, UNSPEC_VCVT)
ENTRY_BINARY_FPM (vcvtq_mf8_f16_fpm, f8q, f16q, f16q, UNSPEC_VCVT)
ENTRY_BINARY_FPM (vcvt_mf8_f32_fpm, f8, f32q, f32q, UNSPEC_VCVT)
ENTRY_TERNARY_FPM (vcvt_high_mf8_f32_fpm, f8q, f8, f32q, f32q, UNSPEC_VCVT_HIGH)
#undef REQUIRED_EXTENSIONS
// fpm scaling
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
ENTRY_VHSDF_VHSDI (vscale, UNSPEC_FSCALE)
#undef REQUIRED_EXTENSIONS
// fpm dot2 product
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT2)
ENTRY_VDOT_FPM (f16, UNSPEC_VDOT2)
#undef REQUIRED_EXTENSIONS
// fpm dot4 product
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT4)
ENTRY_VDOT_FPM (f32, UNSPEC_VDOT4)
#undef REQUIRED_EXTENSIONS
// fp8 multiply-add
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8FMA)
ENTRY_FMA_FPM (vmlalbq, f16, UNSPEC_FMLALB)
ENTRY_FMA_FPM (vmlaltq, f16, UNSPEC_FMLALT)
ENTRY_FMA_FPM (vmlallbbq, f32, UNSPEC_FMLALLBB)
ENTRY_FMA_FPM (vmlallbtq, f32, UNSPEC_FMLALLBT)
ENTRY_FMA_FPM (vmlalltbq, f32, UNSPEC_FMLALLTB)
ENTRY_FMA_FPM (vmlallttq, f32, UNSPEC_FMLALLTT)
#undef REQUIRED_EXTENSIONS
// dup
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_SIMD)
ENTRY_UNARY_N_VALL_F16_SCALAR (vdup, UNSPEC_DUP)
ENTRY_UNARY_LANE_VALL_F16 (vdup, UNSPEC_DUP_LANE)
#undef REQUIRED_EXTENSIONS
// mov
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_SIMD)
ENTRY_UNARY_N_VALL_F16_SCALAR (vmov, UNSPEC_DUP)
#undef REQUIRED_EXTENSIONS
// vcreate
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_SIMD)
ENTRY_UNARY (vcreate_p8, p8, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_p16, p16, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_p64, p64, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_s8, s8, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_s16, s16, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_s32, s32, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_s64, s64, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_u8, u8, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_u16, u16, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_u32, u32, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_u64, u64, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_f16, f16, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_f32, f32, u64_scalar, UNSPEC_VCREATE)
ENTRY_UNARY (vcreate_f64, f64, u64_scalar, UNSPEC_VCREATE)
#undef REQUIRED_EXTENSIONS
// vcopy_lane
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_SIMD)
ENTRY_BINARY_TWO_LANES (vcopy_lane_p8, p8, p8, p8, UNSPEC_VEC_COPY)
ENTRY_BINARY_TWO_LANES (vcopy_lane_s8, s8, s8, s8, UNSPEC_VEC_COPY)
ENTRY_BINARY_TWO_LANES (vcopy_lane_u8, u8, u8, u8, UNSPEC_VEC_COPY)
#undef REQUIRED_EXTENSIONS
// vdupb_lane
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_SIMD)
ENTRY_UNARY_LANE (vdupb_lane_s8, s8_scalar, s8, UNSPEC_DUPB)
ENTRY_UNARY_LANE (vdupb_lane_u8, u8_scalar, u8, UNSPEC_DUPB)
ENTRY_UNARY_LANE (vdupb_lane_p8, p8_scalar, p8, UNSPEC_DUPB)
ENTRY_UNARY_LANE (vdupb_laneq_s8, s8_scalar, s8q, UNSPEC_DUPB)
ENTRY_UNARY_LANE (vdupb_laneq_u8, u8_scalar, u8q, UNSPEC_DUPB)
ENTRY_UNARY_LANE (vdupb_laneq_p8, p8_scalar, p8q, UNSPEC_DUPB)
#undef REQUIRED_EXTENSIONS
// ld1
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_SIMD)
ENTRY_UNARY_VALL_F16_CONST_PTR (vld1, UNSPEC_LD1)
#undef REQUIRED_EXTENSIONS

gcc/config/aarch64/aarch64-simd.md

@@ -112,7 +112,7 @@
}
)
(define_insn "aarch64_dup_lane<mode>"
(define_insn "@aarch64_dup_lane<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w")
(vec_duplicate:VALL_F16
(vec_select:<VEL>
@@ -121,6 +121,7 @@
)))]
"TARGET_SIMD"
{
/* TODO: We need to use ENDIAN_LANE_N like this in the existing intrinsics too.  We still need the next line.  */
operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
return "dup\\t%0.<Vtype>, %1.<Vetype>[%2]";
}
@@ -1164,7 +1165,7 @@
[(set_attr "type" "neon_logic<q>")]
)
(define_insn "aarch64_simd_vec_set<mode>"
(define_insn "@aarch64_simd_vec_set<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
(vec_merge:VALL_F16
(vec_duplicate:VALL_F16
@@ -1178,9 +1179,9 @@
switch (which_alternative)
{
case 0:
return "ins\\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
return "ins1\\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
case 1:
return "ins\\t%0.<Vetype>[%p2], %<vwcore>1";
return "ins2\\t%0.<Vetype>[%p2], %<vwcore>1";
case 2:
return "ld1\\t{%0.<Vetype>}[%p2], %1";
default:
@@ -1190,7 +1191,7 @@
[(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
)
(define_insn "aarch64_simd_vec_set_zero<mode>"
(define_insn "@aarch64_simd_vec_set_zero<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w")
(vec_merge:VALL_F16
(match_operand:VALL_F16 1 "aarch64_simd_imm_zero" "")
@@ -1200,7 +1201,7 @@
{
int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2])));
operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt);
return "ins\\t%0.<Vetype>[%p2], <vwcore>zr";
return "ins3\\t%0.<Vetype>[%p2], <vwcore>zr";
}
)
@@ -1220,7 +1221,7 @@
operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt);
operands[4] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[4]));
return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]";
return "ins4\t%0.<Vetype>[%p2], %3.<Vetype>[%4]";
}
[(set_attr "type" "neon_ins<q>")]
)
@@ -1242,7 +1243,7 @@
operands[4] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode,
INTVAL (operands[4]));
return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]";
return "ins5\t%0.<Vetype>[%p2], %3.<Vetype>[%4]";
}
[(set_attr "type" "neon_ins<q>")]
)
@@ -4357,7 +4358,7 @@
;; RTL uses GCC vector extension indices throughout so flip only for assembly.
;; Extracting lane zero is split into a simple move when it is between SIMD
;; registers or a store.
(define_insn_and_split "aarch64_get_lane<mode>"
(define_insn_and_split "@aarch64_get_lane<mode>"
[(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv")
(vec_select:<VEL>
(match_operand:VALL_F16 1 "register_operand" "w, w, w")
@@ -8401,7 +8402,7 @@
DONE;
})
(define_expand "aarch64_ld1<VALL_F16:mode>"
(define_expand "@aarch64_ld1<VALL_F16:mode>"
[(match_operand:VALL_F16 0 "register_operand")
(match_operand:DI 1 "register_operand")]
"TARGET_SIMD"
@@ -9999,3 +10000,188 @@
"TARGET_FAMINMAX"
"<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
)
;; fpm unary instructions for brain float modes.
(define_insn "@aarch64_<fpm_unary_bf_uns_op><V8BF_ONLY:mode><VB:mode>"
[(set (match_operand:V8BF_ONLY 0 "register_operand" "=w")
(unspec:V8BF_ONLY
[(match_operand:VB 1 "register_operand" "w")
(reg:DI FPM_REGNUM)]
FPM_UNARY_UNS))]
"TARGET_FP8"
"<fpm_unary_bf_uns_op>\t%0.<V8BF_ONLY:Vtype>, %1.<VB:Vtype>"
)
;; fpm unary instructions for half float modes.
(define_insn "@aarch64_<fpm_unary_hf_uns_op><V8HF_ONLY:mode><VB:mode>"
[(set (match_operand:V8HF_ONLY 0 "register_operand" "=w")
(unspec:V8HF_ONLY
[(match_operand:VB 1 "register_operand" "w")
(reg:DI FPM_REGNUM)]
FPM_UNARY_UNS))]
"TARGET_FP8"
"<fpm_unary_hf_uns_op>\t%0.<V8HF_ONLY:Vtype>, %1.<VB:Vtype>"
)
;; fpm unary instructions for brain float modes, where the input is
;; lowered from V16QI to V8QI.
(define_insn
"@aarch64_lower_<fpm_unary_bf_uns_op><V8BF_ONLY:mode><V16QI_ONLY:mode>"
[(set (match_operand:V8BF_ONLY 0 "register_operand" "=w")
(unspec:V8BF_ONLY
[(match_operand:V16QI_ONLY 1 "register_operand" "w")
(reg:DI FPM_REGNUM)]
FPM_UNARY_LOW_UNS))]
"TARGET_FP8"
{
operands[1] = force_lowpart_subreg (V8QImode,
operands[1],
recog_data.operand[1]->mode);
return "<fpm_unary_bf_uns_op>\t%0.<V8BF_ONLY:Vtype>, %1.8b";
}
)
;; fpm unary instructions for half float modes, where the input is
;; lowered from V16QI to V8QI.
(define_insn
"@aarch64_lower_<fpm_unary_hf_uns_op><V8HF_ONLY:mode><V16QI_ONLY:mode>"
[(set (match_operand:V8HF_ONLY 0 "register_operand" "=w")
(unspec:V8HF_ONLY
[(match_operand:V16QI_ONLY 1 "register_operand" "w")
(reg:DI FPM_REGNUM)]
FPM_UNARY_LOW_UNS))]
"TARGET_FP8"
{
operands[1] = force_lowpart_subreg (V8QImode,
operands[1],
recog_data.operand[1]->mode);
return "<fpm_unary_hf_uns_op>\t%0.<V8HF_ONLY:Vtype>, %1.8b";
}
)
;; fpm binary instructions.
(define_insn
"@aarch64_<fpm_uns_op><VB:mode><VCVTFPM:mode><VH_SF:mode>"
[(set (match_operand:VB 0 "register_operand" "=w")
(unspec:VB
[(match_operand:VCVTFPM 1 "register_operand" "w")
(match_operand:VH_SF 2 "register_operand" "w")
(reg:DI FPM_REGNUM)]
FPM_BINARY_UNS))]
"TARGET_FP8"
"<fpm_uns_op>\t%0.<VB:Vtype>, %1.<VCVTFPM:Vtype>, %2.<VH_SF:Vtype>"
)
;; fpm ternary instructions.
(define_insn
"@aarch64_<fpm_uns_op><V16QI_ONLY:mode><V8QI_ONLY:mode><V4SF_ONLY:mode><V4SF_ONLY:mode>"
[(set (match_operand:V16QI_ONLY 0 "register_operand" "=w")
(unspec:V16QI_ONLY
[(match_operand:V8QI_ONLY 1 "register_operand" "w")
(match_operand:V4SF_ONLY 2 "register_operand" "w")
(match_operand:V4SF_ONLY 3 "register_operand" "w")
(reg:DI FPM_REGNUM)]
FPM_TERNARY_VCVT_UNS))]
"TARGET_FP8"
{
operands[1] = force_reg (V16QImode, operands[1]);
return "<fpm_uns_op>\t%1.16b, %2.<V4SF_ONLY:Vtype>, %3.<V4SF_ONLY:Vtype>";
}
)
;; fpm scale instructions
(define_insn "@aarch64_<fpm_uns_op><VHSDF:mode><VHSDI:mode>"
[(set (match_operand:VHSDF 0 "register_operand" "=w")
(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
(match_operand:VHSDI 2 "register_operand" "w")]
FPM_SCALE_UNS))]
"TARGET_FP8"
"<fpm_uns_op>\t%0.<VHSDF:Vtype>, %1.<VHSDF:Vtype>, %2.<VHSDI:Vtype>"
)
;; fpm vdot2 instructions.
(define_insn
"@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB:mode>"
[(set (match_operand:VHF 0 "register_operand" "=w")
(unspec:VHF
[(match_operand:VHF 1 "register_operand" "w")
(match_operand:VB 2 "register_operand" "w")
(match_operand:VB 3 "register_operand" "w")
(reg:DI FPM_REGNUM)]
FPM_VDOT2_UNS))]
"TARGET_FP8DOT2"
"<fpm_uns_op>\t%1.<VHF:Vtype>, %2.<VB:Vtype>, %3.<VB:Vtype>"
)
;; fpm vdot2 instructions with lane.
(define_insn
"@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB2:mode><SI_ONLY:mode>"
[(set (match_operand:VHF 0 "register_operand" "=w")
(unspec:VHF
[(match_operand:VHF 1 "register_operand" "w")
(match_operand:VB 2 "register_operand" "w")
(match_operand:VB2 3 "register_operand" "w")
(match_operand:SI_ONLY 4 "const_int_operand" "n")
(reg:DI FPM_REGNUM)]
FPM_VDOT2_UNS))]
"TARGET_FP8DOT2"
"<fpm_uns_op>\t%1.<VHF:Vtype>, %2.<VB:Vtype>, %3.<VHF:Vdotlanetype>[%4]"
)
;; fpm vdot4 instructions.
(define_insn
"@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB:mode>"
[(set (match_operand:VDQSF 0 "register_operand" "=w")
(unspec:VDQSF
[(match_operand:VDQSF 1 "register_operand" "w")
(match_operand:VB 2 "register_operand" "w")
(match_operand:VB 3 "register_operand" "w")
(reg:DI FPM_REGNUM)]
FPM_VDOT4_UNS))]
"TARGET_FP8DOT4"
"<fpm_uns_op>\t%1.<VDQSF:Vtype>, %2.<VB:Vtype>, %3.<VB:Vtype>"
)
;; fpm vdot4 instructions with lane.
(define_insn
"@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB2:mode><SI_ONLY:mode>"
[(set (match_operand:VDQSF 0 "register_operand" "=w")
(unspec:VDQSF
[(match_operand:VDQSF 1 "register_operand" "w")
(match_operand:VB 2 "register_operand" "w")
(match_operand:VB2 3 "register_operand" "w")
(match_operand:SI_ONLY 4 "const_int_operand" "n")
(reg:DI FPM_REGNUM)]
FPM_VDOT4_UNS))]
"TARGET_FP8DOT4"
"<fpm_uns_op>\t%1.<VDQSF:Vtype>, %2.<VB:Vtype>, %3.<VDQSF:Vdotlanetype>[%4]"
)
;; fpm fma instructions.
(define_insn
"@aarch64_<fpm_uns_op><VQ_HSF:mode><VQ_HSF:mode><V16QI_ONLY:mode><V16QI_ONLY:mode>"
[(set (match_operand:VQ_HSF 0 "register_operand" "=w")
(unspec:VQ_HSF
[(match_operand:VQ_HSF 1 "register_operand" "w")
(match_operand:V16QI_ONLY 2 "register_operand" "w")
(match_operand:V16QI_ONLY 3 "register_operand" "w")
(reg:DI FPM_REGNUM)]
FPM_FMA_UNS))]
"TARGET_FP8FMA"
"<fpm_uns_op>\t%1.<VQ_HSF:Vtype>, %2.<V16QI_ONLY:Vtype>, %3.<V16QI_ONLY:Vtype>"
)
;; fpm fma instructions with lane.
(define_insn
"@aarch64_<fpm_uns_op><VQ_HSF:mode><VQ_HSF:mode><V16QI_ONLY:mode><VB:mode><SI_ONLY:mode>"
[(set (match_operand:VQ_HSF 0 "register_operand" "=w")
(unspec:VQ_HSF
[(match_operand:VQ_HSF 1 "register_operand" "w")
(match_operand:V16QI_ONLY 2 "register_operand" "w")
(match_operand:VB 3 "register_operand" "w")
(match_operand:SI_ONLY 4 "const_int_operand" "n")
(reg:DI FPM_REGNUM)]
FPM_FMA_UNS))]
"TARGET_FP8FMA"
"<fpm_uns_op>\t%1.<VQ_HSF:Vtype>, %2.<V16QI_ONLY:Vtype>, %3.b[%4]"
)

gcc/config/aarch64/aarch64.h

@@ -494,6 +494,15 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
((TARGET_SVE2p1 || TARGET_STREAMING) \
&& (TARGET_SME2 || TARGET_NON_STREAMING))
/* fp8 dot product instructions are enabled through +fp8dot2. */
#define TARGET_FP8DOT2 AARCH64_HAVE_ISA (FP8DOT2)
/* fp8 dot product instructions are enabled through +fp8dot4. */
#define TARGET_FP8DOT4 AARCH64_HAVE_ISA (FP8DOT4)
/* fp8 multiply-add instructions are enabled through +fp8fma. */
#define TARGET_FP8FMA AARCH64_HAVE_ISA (FP8FMA)
/* Standard register usage. */
/* 31 64-bit general purpose registers R0-R30:

gcc/config/aarch64/arm_neon.h

@@ -2490,104 +2490,6 @@ vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b)
return (int32x4_t) __builtin_aarch64_sqrdmulhv4si (__a, __b);
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_s8 (uint64_t __a)
{
return (int8x8_t) __a;
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_s16 (uint64_t __a)
{
return (int16x4_t) __a;
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_s32 (uint64_t __a)
{
return (int32x2_t) __a;
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_s64 (uint64_t __a)
{
return (int64x1_t) {__a};
}
__extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_f16 (uint64_t __a)
{
return (float16x4_t) __a;
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_f32 (uint64_t __a)
{
return (float32x2_t) __a;
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_u8 (uint64_t __a)
{
return (uint8x8_t) __a;
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_u16 (uint64_t __a)
{
return (uint16x4_t) __a;
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_u32 (uint64_t __a)
{
return (uint32x2_t) __a;
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_u64 (uint64_t __a)
{
return (uint64x1_t) {__a};
}
__extension__ extern __inline float64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_f64 (uint64_t __a)
{
return (float64x1_t) __a;
}
__extension__ extern __inline poly8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_p8 (uint64_t __a)
{
return (poly8x8_t) __a;
}
__extension__ extern __inline poly16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_p16 (uint64_t __a)
{
return (poly16x4_t) __a;
}
__extension__ extern __inline poly64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_p64 (uint64_t __a)
{
return (poly64x1_t) __a;
}
/* vget_lane */
__extension__ extern __inline float16_t
@@ -9245,14 +9147,14 @@ vcopy_lane_f64 (float64x1_t __a, const int __lane1,
__a, __lane1);
}
__extension__ extern __inline poly8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcopy_lane_p8 (poly8x8_t __a, const int __lane1,
poly8x8_t __b, const int __lane2)
{
return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
__a, __lane1);
}
/* __extension__ extern __inline poly8x8_t */
/* __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) */
/* vcopy_lane_p8 (poly8x8_t __a, const int __lane1, */
/* poly8x8_t __b, const int __lane2) */
/* { */
/* return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), */
/* __a, __lane1); */
/* } */
__extension__ extern __inline poly16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -9308,14 +9210,14 @@ vcopy_lane_s64 (int64x1_t __a, const int __lane1,
__a, __lane1);
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcopy_lane_u8 (uint8x8_t __a, const int __lane1,
uint8x8_t __b, const int __lane2)
{
return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
__a, __lane1);
}
/* __extension__ extern __inline uint8x8_t */
/* __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) */
/* vcopy_lane_u8 (uint8x8_t __a, const int __lane1, */
/* uint8x8_t __b, const int __lane2) */
/* { */
/* return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), */
/* __a, __lane1); */
/* } */
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -10456,18 +10358,11 @@ vcvtpq_u64_f64 (float64x2_t __a)
/* vdup_n */
__extension__ extern __inline float16x4_t
__extension__ extern __inline poly64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_f16 (float16_t __a)
vdup_n_p64 (poly64_t __a)
{
return (float16x4_t) {__a, __a, __a, __a};
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_f32 (float32_t __a)
{
return (float32x2_t) {__a, __a};
return (poly64x1_t) {__a};
}
__extension__ extern __inline float64x1_t
@@ -10477,48 +10372,6 @@ vdup_n_f64 (float64_t __a)
return (float64x1_t) {__a};
}
__extension__ extern __inline poly8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_p8 (poly8_t __a)
{
return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline poly16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_p16 (poly16_t __a)
{
return (poly16x4_t) {__a, __a, __a, __a};
}
__extension__ extern __inline poly64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_p64 (poly64_t __a)
{
return (poly64x1_t) {__a};
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_s8 (int8_t __a)
{
return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_s16 (int16_t __a)
{
return (int16x4_t) {__a, __a, __a, __a};
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_s32 (int32_t __a)
{
return (int32x2_t) {__a, __a};
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_s64 (int64_t __a)
@@ -10526,27 +10379,6 @@ vdup_n_s64 (int64_t __a)
return (int64x1_t) {__a};
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_u8 (uint8_t __a)
{
return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_u16 (uint16_t __a)
{
return (uint16x4_t) {__a, __a, __a, __a};
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_u32 (uint32_t __a)
{
return (uint32x2_t) {__a, __a};
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_u64 (uint64_t __a)
@@ -10554,125 +10386,8 @@ vdup_n_u64 (uint64_t __a)
return (uint64x1_t) {__a};
}
/* vdupq_n */
__extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_f16 (float16_t __a)
{
return (float16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_f32 (float32_t __a)
{
return (float32x4_t) {__a, __a, __a, __a};
}
__extension__ extern __inline float64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_f64 (float64_t __a)
{
return (float64x2_t) {__a, __a};
}
__extension__ extern __inline poly8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_p8 (poly8_t __a)
{
return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline poly16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_p16 (poly16_t __a)
{
return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline poly64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_p64 (poly64_t __a)
{
return (poly64x2_t) {__a, __a};
}
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_s8 (int8_t __a)
{
return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_s16 (int16_t __a)
{
return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_s32 (int32_t __a)
{
return (int32x4_t) {__a, __a, __a, __a};
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_s64 (int64_t __a)
{
return (int64x2_t) {__a, __a};
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_u8 (uint8_t __a)
{
return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_u16 (uint16_t __a)
{
return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_u32 (uint32_t __a)
{
return (uint32x4_t) {__a, __a, __a, __a};
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_u64 (uint64_t __a)
{
return (uint64x2_t) {__a, __a};
}
/* vdup_lane */
__extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_f16 (float16x4_t __a, const int __b)
{
return __aarch64_vdup_lane_f16 (__a, __b);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_f32 (float32x2_t __a, const int __b)
{
return __aarch64_vdup_lane_f32 (__a, __b);
}
__extension__ extern __inline float64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_f64 (float64x1_t __a, const int __b)
@@ -10680,20 +10395,6 @@ vdup_lane_f64 (float64x1_t __a, const int __b)
return __aarch64_vdup_lane_f64 (__a, __b);
}
__extension__ extern __inline poly8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_p8 (poly8x8_t __a, const int __b)
{
return __aarch64_vdup_lane_p8 (__a, __b);
}
__extension__ extern __inline poly16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_p16 (poly16x4_t __a, const int __b)
{
return __aarch64_vdup_lane_p16 (__a, __b);
}
__extension__ extern __inline poly64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_p64 (poly64x1_t __a, const int __b)
@@ -10701,27 +10402,6 @@ vdup_lane_p64 (poly64x1_t __a, const int __b)
return __aarch64_vdup_lane_p64 (__a, __b);
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_s8 (int8x8_t __a, const int __b)
{
return __aarch64_vdup_lane_s8 (__a, __b);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_s16 (int16x4_t __a, const int __b)
{
return __aarch64_vdup_lane_s16 (__a, __b);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_s32 (int32x2_t __a, const int __b)
{
return __aarch64_vdup_lane_s32 (__a, __b);
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_s64 (int64x1_t __a, const int __b)
@@ -10729,27 +10409,6 @@ vdup_lane_s64 (int64x1_t __a, const int __b)
return __aarch64_vdup_lane_s64 (__a, __b);
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_u8 (uint8x8_t __a, const int __b)
{
return __aarch64_vdup_lane_u8 (__a, __b);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_u16 (uint16x4_t __a, const int __b)
{
return __aarch64_vdup_lane_u16 (__a, __b);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_u32 (uint32x2_t __a, const int __b)
{
return __aarch64_vdup_lane_u32 (__a, __b);
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_u64 (uint64x1_t __a, const int __b)
@@ -11057,28 +10716,6 @@ vdupq_laneq_u64 (uint64x2_t __a, const int __b)
return __aarch64_vdupq_laneq_u64 (__a, __b);
}
/* vdupb_lane */
__extension__ extern __inline poly8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupb_lane_p8 (poly8x8_t __a, const int __b)
{
return __aarch64_vget_lane_any (__a, __b);
}
__extension__ extern __inline int8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupb_lane_s8 (int8x8_t __a, const int __b)
{
return __aarch64_vget_lane_any (__a, __b);
}
__extension__ extern __inline uint8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupb_lane_u8 (uint8x8_t __a, const int __b)
{
return __aarch64_vget_lane_any (__a, __b);
}
/* vduph_lane */
__extension__ extern __inline float16_t
@@ -11157,28 +10794,6 @@ vdupd_lane_u64 (uint64x1_t __a, const int __b)
return __a[0];
}
/* vdupb_laneq */
__extension__ extern __inline poly8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupb_laneq_p8 (poly8x16_t __a, const int __b)
{
return __aarch64_vget_lane_any (__a, __b);
}
__extension__ extern __inline int8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupb_laneq_s8 (int8x16_t __a, const int __b)
{
return __aarch64_vget_lane_any (__a, __b);
}
__extension__ extern __inline uint8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupb_laneq_u8 (uint8x16_t __a, const int __b)
{
return __aarch64_vget_lane_any (__a, __b);
}
/* vduph_laneq */
__extension__ extern __inline float16_t
@@ -11962,111 +11577,6 @@ vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
__a);
}
/* vld1 */
__extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_f16 (const float16_t *__a)
{
return __builtin_aarch64_ld1v4hf (__a);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_f32 (const float32_t *__a)
{
return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) __a);
}
__extension__ extern __inline float64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_f64 (const float64_t *__a)
{
return (float64x1_t) {*__a};
}
__extension__ extern __inline poly8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_p8 (const poly8_t *__a)
{
return __builtin_aarch64_ld1v8qi_ps (
(const __builtin_aarch64_simd_qi *) __a);
}
__extension__ extern __inline poly16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_p16 (const poly16_t *__a)
{
return __builtin_aarch64_ld1v4hi_ps (
(const __builtin_aarch64_simd_hi *) __a);
}
__extension__ extern __inline poly64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_p64 (const poly64_t *__a)
{
return (poly64x1_t) {*__a};
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s8 (const int8_t *__a)
{
return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) __a);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s16 (const int16_t *__a)
{
return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) __a);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s32 (const int32_t *__a)
{
return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) __a);
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s64 (const int64_t *__a)
{
return (int64x1_t) {*__a};
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u8 (const uint8_t *__a)
{
return __builtin_aarch64_ld1v8qi_us (
(const __builtin_aarch64_simd_qi *) __a);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u16 (const uint16_t *__a)
{
return __builtin_aarch64_ld1v4hi_us (
(const __builtin_aarch64_simd_hi *) __a);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u32 (const uint32_t *__a)
{
return __builtin_aarch64_ld1v2si_us (
(const __builtin_aarch64_simd_si *) __a);
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u64 (const uint64_t *__a)
{
return (uint64x1_t) {*__a};
}
/* vld1x3 */
__extension__ extern __inline uint8x8x3_t
@@ -12282,87 +11792,6 @@ vld1q_p64_x3 (const poly64_t *__a)
/* vld1q */
__extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f16 (const float16_t *__a)
{
return __builtin_aarch64_ld1v8hf (__a);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f32 (const float32_t *__a)
{
return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) __a);
}
__extension__ extern __inline float64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f64 (const float64_t *__a)
{
return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) __a);
}
__extension__ extern __inline poly8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p8 (const poly8_t *__a)
{
return __builtin_aarch64_ld1v16qi_ps (
(const __builtin_aarch64_simd_qi *) __a);
}
__extension__ extern __inline poly16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p16 (const poly16_t *__a)
{
return __builtin_aarch64_ld1v8hi_ps (
(const __builtin_aarch64_simd_hi *) __a);
}
__extension__ extern __inline poly64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p64 (const poly64_t *__a)
{
return __builtin_aarch64_ld1v2di_ps (
(const __builtin_aarch64_simd_di *) __a);
}
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s8 (const int8_t *__a)
{
return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) __a);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s16 (const int16_t *__a)
{
return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) __a);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s32 (const int32_t *__a)
{
return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) __a);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s64 (const int64_t *__a)
{
return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) __a);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u8 (const uint8_t *__a)
{
return __builtin_aarch64_ld1v16qi_us (
(const __builtin_aarch64_simd_qi *) __a);
}
__extension__ extern __inline uint8x8x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u8_x2 (const uint8_t *__a)
@@ -12574,30 +12003,6 @@ vld1q_p64_x2 (const poly64_t *__a)
(const __builtin_aarch64_simd_di *) __a);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u16 (const uint16_t *__a)
{
return __builtin_aarch64_ld1v8hi_us (
(const __builtin_aarch64_simd_hi *) __a);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u32 (const uint32_t *__a)
{
return __builtin_aarch64_ld1v4si_us (
(const __builtin_aarch64_simd_si *) __a);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u64 (const uint64_t *__a)
{
return __builtin_aarch64_ld1v2di_us (
(const __builtin_aarch64_simd_di *) __a);
}
/* vld1(q)_x4. */
__extension__ extern __inline int8x8x4_t
@@ -16709,18 +16114,11 @@ vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
/* vmov_n_ */
__extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_f16 (float16_t __a)
{
return vdup_n_f16 (__a);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_f32 (float32_t __a)
{
return vdup_n_f32 (__a);
}
__extension__ extern __inline poly64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_p64 (poly64_t __a)
{
return (poly64x1_t) {__a};
}
__extension__ extern __inline float64x1_t
@@ -16730,48 +16128,6 @@ vmov_n_f64 (float64_t __a)
return (float64x1_t) {__a};
}
__extension__ extern __inline poly8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_p8 (poly8_t __a)
{
return vdup_n_p8 (__a);
}
__extension__ extern __inline poly16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_p16 (poly16_t __a)
{
return vdup_n_p16 (__a);
}
__extension__ extern __inline poly64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_p64 (poly64_t __a)
{
return vdup_n_p64 (__a);
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_s8 (int8_t __a)
{
return vdup_n_s8 (__a);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_s16 (int16_t __a)
{
return vdup_n_s16 (__a);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_s32 (int32_t __a)
{
return vdup_n_s32 (__a);
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_s64 (int64_t __a)
@@ -16779,27 +16135,6 @@ vmov_n_s64 (int64_t __a)
return (int64x1_t) {__a};
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_u8 (uint8_t __a)
{
return vdup_n_u8 (__a);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_u16 (uint16_t __a)
{
return vdup_n_u16 (__a);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_u32 (uint32_t __a)
{
return vdup_n_u32 (__a);
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmov_n_u64 (uint64_t __a)
@@ -16807,104 +16142,6 @@ vmov_n_u64 (uint64_t __a)
return (uint64x1_t) {__a};
}
__extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_f16 (float16_t __a)
{
return vdupq_n_f16 (__a);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_f32 (float32_t __a)
{
return vdupq_n_f32 (__a);
}
__extension__ extern __inline float64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_f64 (float64_t __a)
{
return vdupq_n_f64 (__a);
}
__extension__ extern __inline poly8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_p8 (poly8_t __a)
{
return vdupq_n_p8 (__a);
}
__extension__ extern __inline poly16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_p16 (poly16_t __a)
{
return vdupq_n_p16 (__a);
}
__extension__ extern __inline poly64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_p64 (poly64_t __a)
{
return vdupq_n_p64 (__a);
}
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_s8 (int8_t __a)
{
return vdupq_n_s8 (__a);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_s16 (int16_t __a)
{
return vdupq_n_s16 (__a);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_s32 (int32_t __a)
{
return vdupq_n_s32 (__a);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_s64 (int64_t __a)
{
return vdupq_n_s64 (__a);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_u8 (uint8_t __a)
{
return vdupq_n_u8 (__a);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_u16 (uint16_t __a)
{
return vdupq_n_u16 (__a);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_u32 (uint32_t __a)
{
return vdupq_n_u32 (__a);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovq_n_u64 (uint64_t __a)
{
return vdupq_n_u64 (__a);
}
/* vmul_lane */
__extension__ extern __inline float32x2_t


@@ -41,6 +41,9 @@
;; Iterators for single modes, for "@" patterns.
(define_mode_iterator SI_ONLY [SI])
(define_mode_iterator DI_ONLY [DI])
(define_mode_iterator V8QI_ONLY [V8QI])
(define_mode_iterator V16QI_ONLY [V16QI])
(define_mode_iterator V4SF_ONLY [V4SF])
;; Iterator for all integer modes (up to 64-bit)
(define_mode_iterator ALLI [QI HI SI DI])
@@ -160,9 +163,19 @@
;; Advanced SIMD Float modes.
(define_mode_iterator VDQF [V2SF V4SF V2DF])
(define_mode_iterator VHF [(V4HF "TARGET_SIMD_F16INST")
(V8HF "TARGET_SIMD_F16INST")])
(define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
(V8HF "TARGET_SIMD_F16INST")
V2SF V4SF V2DF])
(define_mode_iterator VH_SF [(V4HF "TARGET_SIMD_F16INST")
(V8HF "TARGET_SIMD_F16INST")
V4SF])
;; Advanced SIMD Integer modes.
(define_mode_iterator VHSDI [V4HI V8HI V2SI V4SI V2DI])
;; Advanced SIMD Float modes, and DF.
(define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF])
@@ -312,6 +325,7 @@
;; All byte modes.
(define_mode_iterator VB [V8QI V16QI])
(define_mode_iterator VB2 [VB])
;; 1 and 2 lane DI and DF modes.
(define_mode_iterator V12DIF [V1DI V1DF V2DI V2DF])
@@ -426,6 +440,12 @@
(V8HF "TARGET_SIMD_F16INST")
V2SF V4SF])
;; Modes available for Advanced SIMD FP8 conversion operations.
(define_mode_iterator VCVTFPM [V8QI
(V4HF "TARGET_SIMD_F16INST")
(V8HF "TARGET_SIMD_F16INST")
V4SF])
;; Iterators for single modes, for "@" patterns.
(define_mode_iterator VNx16QI_ONLY [VNx16QI])
(define_mode_iterator VNx16SI_ONLY [VNx16SI])
@@ -635,6 +655,10 @@
;; Bfloat16 modes to which V4SF can be converted
(define_mode_iterator V4SF_TO_BF [V4BF V8BF])
;; Float16 and Bfloat16 modes separately
(define_mode_iterator V8HF_ONLY [V8HF])
(define_mode_iterator V8BF_ONLY [V8BF])
(define_mode_iterator SVE_BHSx24 [VNx32QI VNx16HI VNx8SI
VNx16BF VNx16HF VNx8SF
VNx64QI VNx32HI VNx16SI
@@ -691,6 +715,9 @@
UNSPEC_ASHIFT_SIGNED ; Used in aarch64-simd.md.
UNSPEC_ASHIFT_UNSIGNED ; Used in aarch64-simd.md.
UNSPEC_ABS ; Used in aarch64-simd.md.
UNSPEC_DUP ; Used in aarch64-simd.md.
UNSPEC_DUPB ; Used in aarch64-simd.md.
UNSPEC_DUP_LANE ; Used in aarch64-simd.md.
UNSPEC_FMAX ; Used in aarch64-simd.md.
UNSPEC_FMAXNMV ; Used in aarch64-simd.md.
UNSPEC_FMAXV ; Used in aarch64-simd.md.
@@ -698,7 +725,12 @@
UNSPEC_FMINNMV ; Used in aarch64-simd.md.
UNSPEC_FMINV ; Used in aarch64-simd.md.
UNSPEC_FADDV ; Used in aarch64-simd.md.
UNSPEC_FMLALLBB ; Used in aarch64-simd.md.
UNSPEC_FMLALLBT ; Used in aarch64-simd.md.
UNSPEC_FMLALLTB ; Used in aarch64-simd.md.
UNSPEC_FMLALLTT ; Used in aarch64-simd.md.
UNSPEC_FNEG ; Used in aarch64-simd.md.
UNSPEC_FSCALE ; Used in aarch64-simd.md.
UNSPEC_ADDV ; Used in aarch64-simd.md.
UNSPEC_SMAXV ; Used in aarch64-simd.md.
UNSPEC_SMINV ; Used in aarch64-simd.md.
@@ -736,6 +768,17 @@
UNSPEC_SSHLL ; Used in aarch64-simd.md.
UNSPEC_USHLL ; Used in aarch64-simd.md.
UNSPEC_ADDP ; Used in aarch64-simd.md.
UNSPEC_VCREATE ; Used in aarch64-simd.md.
UNSPEC_VCVT ; Used in aarch64-simd.md.
UNSPEC_VCVT_HIGH ; Used in aarch64-simd.md.
UNSPEC_VCVT1 ; Used in aarch64-simd.md.
UNSPEC_VCVT1_HIGH ; Used in aarch64-simd.md.
UNSPEC_VCVT1_LOW ; Used in aarch64-simd.md.
UNSPEC_VCVT2 ; Used in aarch64-simd.md.
UNSPEC_VCVT2_HIGH ; Used in aarch64-simd.md.
UNSPEC_VCVT2_LOW ; Used in aarch64-simd.md.
UNSPEC_VDOT2 ; Used in aarch64-simd.md.
UNSPEC_VDOT4 ; Used in aarch64-simd.md.
UNSPEC_TBL ; Used in vector permute patterns.
UNSPEC_TBLQ ; Used in vector permute patterns.
UNSPEC_TBX ; Used in vector permute patterns.
@@ -773,6 +816,7 @@
UNSPEC_PMULL ; Used in aarch64-simd.md.
UNSPEC_PMULL2 ; Used in aarch64-simd.md.
UNSPEC_REV_REGLIST ; Used in aarch64-simd.md.
UNSPEC_VEC_COPY ; Used in aarch64-simd.md.
UNSPEC_VEC_SHR ; Used in aarch64-simd.md.
UNSPEC_SQRDMLAH ; Used in aarch64-simd.md.
UNSPEC_SQRDMLSH ; Used in aarch64-simd.md.
@@ -2463,6 +2507,11 @@
(VNx8HF ".h") (VNx16HF "") (VNx32HF "")
(VNx8HI ".h") (VNx16HI "") (VNx32HI "")])
;; Lane index suffix for fp8 vdot operations depends on the output mode
(define_mode_attr Vdotlanetype [(V4HF "2b") (V8HF "2b")
(V2SF "4b") (V4SF "4b")])
;; The number of bytes controlled by a predicate
(define_mode_attr data_bytes [(VNx16BI "1") (VNx8BI "2")
(VNx4BI "4") (VNx2BI "8")])
@@ -4659,3 +4708,59 @@
(define_code_attr faminmax_op
[(smax "famax") (smin "famin")])
;; Iterators and attributes for fpm instructions
(define_int_iterator FPM_UNARY_UNS
[UNSPEC_VCVT1
UNSPEC_VCVT1_HIGH
UNSPEC_VCVT2
UNSPEC_VCVT2_HIGH])
(define_int_iterator FPM_UNARY_LOW_UNS [UNSPEC_VCVT1_LOW UNSPEC_VCVT2_LOW])
(define_int_iterator FPM_BINARY_UNS [UNSPEC_VCVT])
(define_int_iterator FPM_SCALE_UNS [UNSPEC_FSCALE])
(define_int_iterator FPM_TERNARY_VCVT_UNS [UNSPEC_VCVT_HIGH])
(define_int_attr fpm_unary_bf_uns_op
[(UNSPEC_VCVT1 "bf1cvtl")
(UNSPEC_VCVT1_HIGH "bf1cvtl2")
(UNSPEC_VCVT1_LOW "bf1cvtl")
(UNSPEC_VCVT2 "bf2cvtl")
(UNSPEC_VCVT2_HIGH "bf2cvtl2")
(UNSPEC_VCVT2_LOW "bf2cvtl")])
(define_int_attr fpm_unary_hf_uns_op
[(UNSPEC_VCVT1 "f1cvtl")
(UNSPEC_VCVT1_HIGH "f1cvtl2")
(UNSPEC_VCVT1_LOW "f1cvtl")
(UNSPEC_VCVT2 "f2cvtl")
(UNSPEC_VCVT2_HIGH "f2cvtl2")
(UNSPEC_VCVT2_LOW "f2cvtl")])
(define_int_iterator FPM_VDOT2_UNS [UNSPEC_VDOT2])
(define_int_iterator FPM_VDOT4_UNS [UNSPEC_VDOT4])
(define_int_iterator FPM_FMA_UNS
[UNSPEC_FMLALB
UNSPEC_FMLALT
UNSPEC_FMLALLBB
UNSPEC_FMLALLBT
UNSPEC_FMLALLTB
UNSPEC_FMLALLTT])
(define_int_attr fpm_uns_op
[(UNSPEC_FSCALE "fscale")
(UNSPEC_VCVT "fcvtn")
(UNSPEC_VCVT_HIGH "fcvtn2")
(UNSPEC_FMLALB "fmlalb")
(UNSPEC_FMLALT "fmlalt")
(UNSPEC_FMLALLBB "fmlallbb")
(UNSPEC_FMLALLBT "fmlallbt")
(UNSPEC_FMLALLTB "fmlalltb")
(UNSPEC_FMLALLTT "fmlalltt")
(UNSPEC_VDOT2 "fdot")
(UNSPEC_VDOT4 "fdot")])


@@ -21807,6 +21807,12 @@ Enable support for Armv8.9-a/9.4-a translation hardening extension.
Enable the RCpc3 (Release Consistency) extension.
@item fp8
Enable the fp8 (8-bit floating point) extension.
@item fp8dot2
Enable the fp8dot2 (8-bit floating point dot product to half precision) extension.
@item fp8dot4
Enable the fp8dot4 (8-bit floating point dot product to single precision) extension.
@item fp8fma
Enable the fp8fma (8-bit floating point multiply-add) extension.
@item faminmax
Enable the Floating Point Absolute Maximum/Minimum extension.
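
As a hedged usage sketch (not part of the patch): the new values are enabled like any other architecture modifier, so a translation unit built with, say, -march=armv9-a+fp8fma can call the new multiply-add intrinsics directly. The intrinsic name below matches the new tests; the wrapper function and file name are illustrative only.

/* Illustrative only; compile with something like:
     gcc -O2 -march=armv9-a+fp8fma -c fp8_fma_example.c  */
#include <arm_neon.h>

/* Multiply-add of the even (bottom) fp8 elements of b and c, widened into
   the half-precision accumulator, under the FPMR configuration in fpm.  */
float16x8_t
fp8_fma_example (float16x8_t acc, mfloat8x16_t b, mfloat8x16_t c, fpm_t fpm)
{
  return vmlalbq_f16_mf8_fpm (acc, b, c, fpm);
}
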


@@ -5,19 +5,9 @@
#include <arm_acle.h>
#ifdef __ARM_FEATURE_FP8
#error "__ARM_FEATURE_FP8 feature macro defined."
#endif
#pragma GCC push_options
#pragma GCC target("arch=armv9.4-a+fp8")
/* We do not define __ARM_FEATURE_FP8 until all
relevant features have been added. */
#ifdef __ARM_FEATURE_FP8
#error "__ARM_FEATURE_FP8 feature macro defined."
#endif
/*
**test_write_fpmr_sysreg_asm_64:
** msr fpmr, x0


@@ -0,0 +1,221 @@
/* { dg-do compile } */
/* { dg-additional-options "-O3 -march=armv9-a+fp8fma" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include "arm_neon.h"
/*
** test_vmlalbq_f16_fpm:
** msr fpmr, x0
** fmlalb v0.8h, v1.16b, v2.16b
** ret
*/
float16x8_t
test_vmlalbq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vmlalbq_f16_mf8_fpm (a, b, c, d);
}
/*
** test_vmlaltq_f16_fpm:
** msr fpmr, x0
** fmlalt v0.8h, v1.16b, v2.16b
** ret
*/
float16x8_t
test_vmlaltq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vmlaltq_f16_mf8_fpm (a, b, c, d);
}
/*
** test_vmlallbbq_f32_fpm:
** msr fpmr, x0
** fmlallbb v0.4s, v1.16b, v2.16b
** ret
*/
float32x4_t
test_vmlallbbq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vmlallbbq_f32_mf8_fpm (a, b, c, d);
}
/*
** test_vmlallbtq_f32_fpm:
** msr fpmr, x0
** fmlallbt v0.4s, v1.16b, v2.16b
** ret
*/
float32x4_t
test_vmlallbtq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vmlallbtq_f32_mf8_fpm (a, b, c, d);
}
/*
** test_vmlalltbq_f32_fpm:
** msr fpmr, x0
** fmlalltb v0.4s, v1.16b, v2.16b
** ret
*/
float32x4_t
test_vmlalltbq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vmlalltbq_f32_mf8_fpm (a, b, c, d);
}
/*
** test_vmlallttq_f32_fpm:
** msr fpmr, x0
** fmlalltt v0.4s, v1.16b, v2.16b
** ret
*/
float32x4_t
test_vmlallttq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vmlallttq_f32_mf8_fpm (a, b, c, d);
}
/*
** test_vmlalbq_lane_f16_fpm:
** msr fpmr, x0
** fmlalb v0.8h, v1.16b, v2.b\[1\]
** ret
*/
float16x8_t
test_vmlalbq_lane_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
{
return vmlalbq_lane_f16_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vmlalbq_laneq_f16_fpm:
** msr fpmr, x0
** fmlalb v0.8h, v1.16b, v2.b\[1\]
** ret
*/
float16x8_t
test_vmlalbq_laneq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vmlalbq_laneq_f16_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vmlaltq_lane_f16_fpm:
** msr fpmr, x0
** fmlalt v0.8h, v1.16b, v2.b\[1\]
** ret
*/
float16x8_t
test_vmlaltq_lane_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
{
return vmlaltq_lane_f16_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vmlaltq_laneq_f16_fpm:
** msr fpmr, x0
** fmlalt v0.8h, v1.16b, v2.b\[1\]
** ret
*/
float16x8_t
test_vmlaltq_laneq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vmlaltq_laneq_f16_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vmlallbbq_lane_f32_fpm:
** msr fpmr, x0
** fmlallbb v0.4s, v1.16b, v2.b\[1\]
** ret
*/
float32x4_t
test_vmlallbbq_lane_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
{
return vmlallbbq_lane_f32_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vmlallbbq_laneq_f32_fpm:
** msr fpmr, x0
** fmlallbb v0.4s, v1.16b, v2.b\[1\]
** ret
*/
float32x4_t
test_vmlallbbq_laneq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vmlallbbq_laneq_f32_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vmlallbtq_lane_f32_fpm:
** msr fpmr, x0
** fmlallbt v0.4s, v1.16b, v2.b\[1\]
** ret
*/
float32x4_t
test_vmlallbtq_lane_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
{
return vmlallbtq_lane_f32_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vmlallbtq_laneq_f32_fpm:
** msr fpmr, x0
** fmlallbt v0.4s, v1.16b, v2.b\[1\]
** ret
*/
float32x4_t
test_vmlallbtq_laneq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vmlallbtq_laneq_f32_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vmlalltbq_lane_f32_fpm:
** msr fpmr, x0
** fmlalltb v0.4s, v1.16b, v2.b\[1\]
** ret
*/
float32x4_t
test_vmlalltbq_lane_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
{
return vmlalltbq_lane_f32_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vmlalltbq_laneq_f32_fpm:
** msr fpmr, x0
** fmlalltb v0.4s, v1.16b, v2.b\[1\]
** ret
*/
float32x4_t
test_vmlalltbq_laneq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vmlalltbq_laneq_f32_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vmlallttq_lane_f32_fpm:
** msr fpmr, x0
** fmlalltt v0.4s, v1.16b, v2.b\[1\]
** ret
*/
float32x4_t
test_vmlallttq_lane_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
{
return vmlallttq_lane_f32_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vmlallttq_laneq_f32_fpm:
** msr fpmr, x0
** fmlalltt v0.4s, v1.16b, v2.b\[1\]
** ret
*/
float32x4_t
test_vmlallttq_laneq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vmlallttq_laneq_f32_mf8_fpm (a, b, c, 1, d);
}


@@ -0,0 +1,60 @@
/* { dg-do compile } */
/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include "arm_neon.h"
/*
** test_vscale_f16:
** fscale v0.4h, v0.4h, v1.4h
** ret
*/
float16x4_t
test_vscale_f16 (float16x4_t a, int16x4_t b)
{
return vscale_f16 (a, b);
}
/*
** test_vscaleq_f16:
** fscale v0.8h, v0.8h, v1.8h
** ret
*/
float16x8_t
test_vscaleq_f16 (float16x8_t a, int16x8_t b)
{
return vscaleq_f16 (a, b);
}
/*
** test_vscale_f32:
** fscale v0.2s, v0.2s, v1.2s
** ret
*/
float32x2_t
test_vscale_f32 (float32x2_t a, int32x2_t b)
{
return vscale_f32 (a, b);
}
/*
** test_vscaleq_f32:
** fscale v0.4s, v0.4s, v1.4s
** ret
*/
float32x4_t
test_vscaleq_f32 (float32x4_t a, int32x4_t b)
{
return vscaleq_f32 (a, b);
}
/*
** test_vscaleq_f64:
** fscale v0.2d, v0.2d, v1.2d
** ret
*/
float64x2_t
test_vscaleq_f64 (float64x2_t a, int64x2_t b)
{
return vscaleq_f64 (a, b);
}


@@ -0,0 +1,197 @@
/* { dg-do compile } */
/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include "arm_neon.h"
/*
** test_vcvt1_bf16:
** msr fpmr, x0
** bf1cvtl v0.8h, v0.8b
** ret
*/
bfloat16x8_t
test_vcvt1_bf16 (mfloat8x8_t a, fpm_t b)
{
return vcvt1_bf16_mf8_fpm(a, b);
}
/*
** test_high_vcvt1_bf16:
** msr fpmr, x0
** bf1cvtl2 v0.8h, v0.16b
** ret
*/
bfloat16x8_t
test_high_vcvt1_bf16 (mfloat8x16_t a, fpm_t b)
{
return vcvt1_high_bf16_mf8_fpm(a, b);
}
/*
** test_low_vcvt1_bf16:
** msr fpmr, x0
** bf1cvtl v0.8h, v0.8b
** ret
*/
bfloat16x8_t
test_low_vcvt1_bf16 (mfloat8x16_t a, fpm_t b)
{
return vcvt1_low_bf16_mf8_fpm(a, b);
}
/*
** test_vcvt1_f16:
** msr fpmr, x0
** f1cvtl v0.8h, v0.8b
** ret
*/
float16x8_t
test_vcvt1_f16 (mfloat8x8_t a, fpm_t b)
{
return vcvt1_f16_mf8_fpm(a, b);
}
/*
** test_high_vcvt1_f16:
** msr fpmr, x0
** f1cvtl2 v0.8h, v0.16b
** ret
*/
float16x8_t
test_high_vcvt1_f16 (mfloat8x16_t a, fpm_t b)
{
return vcvt1_high_f16_mf8_fpm(a, b);
}
/*
** test_low_vcvt1_f16:
** msr fpmr, x0
** f1cvtl v0.8h, v0.8b
** ret
*/
float16x8_t
test_low_vcvt1_f16 (mfloat8x16_t a, fpm_t b)
{
return vcvt1_low_f16_mf8_fpm(a, b);
}
/*
** test_vcvt2_bf16:
** msr fpmr, x0
** bf2cvtl v0.8h, v0.8b
** ret
*/
bfloat16x8_t
test_vcvt2_bf16 (mfloat8x8_t a, fpm_t b)
{
return vcvt2_bf16_mf8_fpm(a, b);
}
/*
** test_high_vcvt2_bf16:
** msr fpmr, x0
** bf2cvtl2 v0.8h, v0.16b
** ret
*/
bfloat16x8_t
test_high_vcvt2_bf16 (mfloat8x16_t a, fpm_t b)
{
return vcvt2_high_bf16_mf8_fpm(a, b);
}
/*
** test_low_vcvt2_bf16:
** msr fpmr, x0
** bf2cvtl v0.8h, v0.8b
** ret
*/
bfloat16x8_t
test_low_vcvt2_bf16 (mfloat8x16_t a, fpm_t b)
{
return vcvt2_low_bf16_mf8_fpm(a, b);
}
/*
** test_vcvt2_f16:
** msr fpmr, x0
** f2cvtl v0.8h, v0.8b
** ret
*/
float16x8_t
test_vcvt2_f16 (mfloat8x8_t a, fpm_t b)
{
return vcvt2_f16_mf8_fpm(a, b);
}
/*
** test_high_vcvt2_f16:
** msr fpmr, x0
** f2cvtl2 v0.8h, v0.16b
** ret
*/
float16x8_t
test_high_vcvt2_f16 (mfloat8x16_t a, fpm_t b)
{
return vcvt2_high_f16_mf8_fpm(a, b);
}
/*
** test_low_vcvt2_f16:
** msr fpmr, x0
** f2cvtl v0.8h, v0.8b
** ret
*/
float16x8_t
test_low_vcvt2_f16 (mfloat8x16_t a, fpm_t b)
{
return vcvt2_low_f16_mf8_fpm(a, b);
}
/*
** test_vcvt_f16:
** msr fpmr, x0
** fcvtn v0.8b, v0.4h, v1.4h
** ret
*/
mfloat8x8_t
test_vcvt_f16 (float16x4_t a, float16x4_t b, fpm_t c)
{
return vcvt_mf8_f16_fpm(a, b, c);
}
/*
** test_vcvtq_f16:
** msr fpmr, x0
** fcvtn v0.16b, v0.8h, v1.8h
** ret
*/
mfloat8x16_t
test_vcvtq_f16 (float16x8_t a, float16x8_t b, fpm_t c)
{
return vcvtq_mf8_f16_fpm(a, b, c);
}
/*
** test_vcvt_f32:
** msr fpmr, x0
** fcvtn v0.8b, v0.4s, v1.4s
** ret
*/
mfloat8x8_t
test_vcvt_f32 (float32x4_t a, float32x4_t b, fpm_t c)
{
return vcvt_mf8_f32_fpm(a, b, c);
}
/*
** test_vcvt_high_f32:
** msr fpmr, x0
** fcvtn2 v0.16b, v1.4s, v2.4s
** ret
*/
mfloat8x16_t
test_vcvt_high_f32 (mfloat8x8_t a, float32x4_t b, float32x4_t c, fpm_t d)
{
return vcvt_high_mf8_f32_fpm(a, b, c, d);
}


@@ -0,0 +1,77 @@
/* { dg-do compile } */
/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include "arm_neon.h"
/*
** test_vdot_f16_fpm:
** msr fpmr, x0
** fdot v0.4h, v1.8b, v2.8b
** ret
*/
float16x4_t
test_vdot_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
{
return vdot_f16_mf8_fpm (a, b, c, d);
}
/*
** test_vdotq_f16_fpm:
** msr fpmr, x0
** fdot v0.8h, v1.16b, v2.16b
** ret
*/
float16x8_t
test_vdotq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vdotq_f16_mf8_fpm (a, b, c, d);
}
/*
** test_vdot_lane_f16_fpm:
** msr fpmr, x0
** fdot v0.4h, v1.8b, v2.2b\[1\]
** ret
*/
float16x4_t
test_vdot_lane_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
{
return vdot_lane_f16_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vdot_laneq_f16_fpm:
** msr fpmr, x0
** fdot v0.4h, v1.8b, v2.2b\[1\]
** ret
*/
float16x4_t
test_vdot_laneq_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t d)
{
return vdot_laneq_f16_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vdotq_lane_f16_fpm:
** msr fpmr, x0
** fdot v0.8h, v1.16b, v2.2b\[1\]
** ret
*/
float16x8_t
test_vdotq_lane_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
{
return vdotq_lane_f16_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vdotq_laneq_f16_fpm:
** msr fpmr, x0
** fdot v0.8h, v1.16b, v2.2b\[1\]
** ret
*/
float16x8_t
test_vdotq_laneq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vdotq_laneq_f16_mf8_fpm (a, b, c, 1, d);
}


@@ -0,0 +1,77 @@
/* { dg-do compile } */
/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include "arm_neon.h"
/*
** test_vdot_f32_fpm:
** msr fpmr, x0
** fdot v0.2s, v1.8b, v2.8b
** ret
*/
float32x2_t
test_vdot_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
{
return vdot_f32_mf8_fpm (a, b, c, d);
}
/*
** test_vdotq_f32_fpm:
** msr fpmr, x0
** fdot v0.4s, v1.16b, v2.16b
** ret
*/
float32x4_t
test_vdotq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vdotq_f32_mf8_fpm (a, b, c, d);
}
/*
** test_vdot_lane_f32_fpm:
** msr fpmr, x0
** fdot v0.2s, v1.8b, v2.4b\[1\]
** ret
*/
float32x2_t
test_vdot_lane_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
{
return vdot_lane_f32_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vdot_laneq_f32_fpm:
** msr fpmr, x0
** fdot v0.2s, v1.8b, v2.4b\[1\]
** ret
*/
float32x2_t
test_vdot_laneq_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t d)
{
return vdot_laneq_f32_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vdotq_lane_f32_fpm:
** msr fpmr, x0
** fdot v0.4s, v1.16b, v2.4b\[1\]
** ret
*/
float32x4_t
test_vdotq_lane_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
{
return vdotq_lane_f32_mf8_fpm (a, b, c, 1, d);
}
/*
** test_vdotq_laneq_f32_fpm:
** msr fpmr, x0
** fdot v0.4s, v1.16b, v2.4b\[1\]
** ret
*/
float32x4_t
test_vdotq_laneq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
{
return vdotq_laneq_f32_mf8_fpm (a, b, c, 1, d);
}