[v2] bpf: add inline memmove and memcpy expansion

Message ID 20240220194631.20557-1-david.faust@oracle.com
State New
Headers
Series [v2] bpf: add inline memmove and memcpy expansion |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Testing passed

Commit Message

David Faust Feb. 20, 2024, 7:46 p.m. UTC
  [Changes from v1: Jose's review comments, all of which I agree with.
 - Fix 'implments' typo in commit message.
 - Change check that alignment is CONST_INT to gcc_assert ().
 - Change default case in alignment switch to gcc_unreachable ().
 - Reword error message for non-constant size memmove/memcpy, and
   update test for the error accordingly.
 - Delete CPYMEM_EXPAND_ERR macro, since it was now only used in
   one place.  ]

BPF programs are not typically linked, which means we cannot fall back
on library calls to implement __builtin_{memmove,memcpy} and should
always expand them inline if possible.

GCC already successfully expands these builtins inline in many cases,
but failed to do so for a few for simple cases involving overlapping
memmove in the kernel BPF selftests and was instead emitting a libcall.

This patch implements a simple inline expansion of memcpy and memmove in
the BPF backend in a verifier-friendly way, with the caveat that the
size must be an integer constant, which is also required by clang.

Tested for bpf-unknown-none on x86_64-linux-gnu host.

Also tested against the BPF verifier by compiling and loading a test
program with overlapping memmove (essentially the memmove-1.c test)
which failed before due to a libcall, and now successfully loads and
passes the verifier.

gcc/

	* config/bpf/bpf-protos.h (bpf_expand_cpymem): New.
	* config/bpf/bpf.cc: (emit_move_loop, bpf_expand_cpymem): New.
	* config/bpf/bpf.md: (cpymemdi, movmemdi): New define_expands.

gcc/testsuite/

	* gcc.target/bpf/memcpy-1.c: New test.
	* gcc.target/bpf/memmove-1.c: New test.
	* gcc.target/bpf/memmove-2.c: New test.
---
 gcc/config/bpf/bpf-protos.h              |   2 +
 gcc/config/bpf/bpf.cc                    | 116 +++++++++++++++++++++++
 gcc/config/bpf/bpf.md                    |  36 +++++++
 gcc/testsuite/gcc.target/bpf/memcpy-1.c  |  26 +++++
 gcc/testsuite/gcc.target/bpf/memmove-1.c |  46 +++++++++
 gcc/testsuite/gcc.target/bpf/memmove-2.c |  23 +++++
 6 files changed, 249 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/bpf/memcpy-1.c
 create mode 100644 gcc/testsuite/gcc.target/bpf/memmove-1.c
 create mode 100644 gcc/testsuite/gcc.target/bpf/memmove-2.c
  

Comments

Jose E. Marchesi Feb. 20, 2024, 8:37 p.m. UTC | #1
Hi Faust.

> +bool
> +bpf_expand_cpymem (rtx *operands, bool is_move)
> +{
> +  /* Size must be constant for this expansion to work.  */
> +  if (!CONST_INT_P (operands[2]))
> +    {
> +      const char *name = is_move ? "memmove" : "memcpy";
> +      if (flag_building_libgcc)
> +	warning (RTL_LOCATION (operands[2]),
> +		 "could not expand call to %<__builtin_%s%> inline: "
> +		 "size must be constant", name);
> +      else
> +	error ("could not expand call to %<__builtin_%s%> inline: "
> +	       "size must be constant", name);
> +      return false;
> +    }

I think you want to use warning_at and error_at above...  the first
argument to `warning' (and second to warning_at) is not a location but
an OPT_W* value.  You should pass 0 to the opt argument in this case as
there is no -W option to handle this warning.

Also, I would not mention 'expand' in the error/warning message, as it
is a GCC internal concept.  I would use perhaps "could not inline call
to %<__builtin_%s%>".

> +  /* Alignment is a CONST_INT.  */
> +  gcc_assert (CONST_INT_P (operands[3]));
> +
> +  rtx dst = operands[0];
> +  rtx src = operands[1];
> +  rtx size = operands[2];
> +  unsigned HOST_WIDE_INT size_bytes = UINTVAL (size);
> +  unsigned align = UINTVAL (operands[3]);
> +  enum machine_mode mode;
> +  switch (align)
> +    {
> +    case 1: mode = QImode; break;
> +    case 2: mode = HImode; break;
> +    case 4: mode = SImode; break;
> +    case 8: mode = DImode; break;
> +    default:
> +      gcc_unreachable ();
> +    }
> +
> +  unsigned iters = size_bytes >> ceil_log2 (align);
> +  unsigned remainder = size_bytes & (align - 1);
> +
> +  int inc = GET_MODE_SIZE (mode);
> +  rtx_code_label *fwd_label, *done_label;
> +  if (is_move)
> +    {
> +      /* For memmove, be careful of overlap.  It is not a concern for memcpy.
> +	 To handle overlap, we check (at runtime) if SRC < DST, and if so do
> +	 the move "backwards" starting from SRC + SIZE.  */
> +      fwd_label = gen_label_rtx ();
> +      done_label = gen_label_rtx ();
> +
> +      rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (dst, 0));
> +      rtx src_addr = copy_to_mode_reg (Pmode, XEXP (src, 0));
> +      emit_cmp_and_jump_insns (src_addr, dst_addr, GEU, NULL_RTX, Pmode,
> +			       true, fwd_label, profile_probability::even ());
> +
> +      /* Emit the "backwards" unrolled loop.  */
> +      emit_move_loop (src, dst, mode, size_bytes, -inc, iters, remainder);
> +      emit_jump_insn (gen_jump (done_label));
> +      emit_barrier ();
> +
> +      emit_label (fwd_label);
> +    }
> +
> +  emit_move_loop (src, dst, mode, 0, inc, iters, remainder);
> +
> +  if (is_move)
> +    emit_label (done_label);
> +
> +  return true;
> +}
> +
>  /* Finally, build the GCC target.  */
>  
>  struct gcc_target targetm = TARGET_INITIALIZER;
> diff --git a/gcc/config/bpf/bpf.md b/gcc/config/bpf/bpf.md
> index 50df1aaa3e2..ca677bc6b50 100644
> --- a/gcc/config/bpf/bpf.md
> +++ b/gcc/config/bpf/bpf.md
> @@ -627,4 +627,40 @@ (define_insn "ldabs<ldop>"
>    "{ldabs<ldop>\t%0|r0 = *(<pldop> *) skb[%0]}"
>    [(set_attr "type" "ld")])
>  
> +;;; memmove and memcopy
> +
> +;; 0 is dst
> +;; 1 is src
> +;; 2 is size of copy in bytes
> +;; 3 is alignment
> +
> +(define_expand "cpymemdi"
> +  [(match_operand:BLK 0 "memory_operand")
> +   (match_operand:BLK 1 "memory_operand")
> +   (match_operand:DI 2 "general_operand")
> +   (match_operand:DI 3 "immediate_operand")]
> +   ""
> +{
> +  if (bpf_expand_cpymem (operands, false))
> +    DONE;
> +  FAIL;
> +})
> +
> +;; 0 is dst
> +;; 1 is src
> +;; 2 is size of copy in bytes
> +;; 3 is alignment
> +
> +(define_expand "movmemdi"
> +  [(match_operand:BLK 0 "memory_operand")
> +   (match_operand:BLK 1 "memory_operand")
> +   (match_operand:DI 2 "general_operand")
> +   (match_operand:DI 3 "immediate_operand")]
> +   ""
> +{
> +  if (bpf_expand_cpymem (operands, true))
> +    DONE;
> +  FAIL;
> +})
> +
>  (include "atomic.md")
> diff --git a/gcc/testsuite/gcc.target/bpf/memcpy-1.c b/gcc/testsuite/gcc.target/bpf/memcpy-1.c
> new file mode 100644
> index 00000000000..6c9707f24e8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/bpf/memcpy-1.c
> @@ -0,0 +1,26 @@
> +/* Ensure memcpy is expanded inline rather than emitting a libcall.  */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +struct context {
> + unsigned int data;
> + unsigned int data_end;
> + unsigned int data_meta;
> + unsigned int ingress;
> + unsigned int queue_index;
> + unsigned int egress;
> +};
> +
> +void
> +cpy_1(struct context *ctx)
> +{
> +  void *data = (void *)(long)ctx->data;
> +  char *dest;
> +  dest = data;
> +  dest += 16;
> +
> +  __builtin_memcpy (dest, data, 8);
> +}
> +
> +/* { dg-final { scan-assembler-times "call" 0 } } */
> diff --git a/gcc/testsuite/gcc.target/bpf/memmove-1.c b/gcc/testsuite/gcc.target/bpf/memmove-1.c
> new file mode 100644
> index 00000000000..3b8ba82639e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/bpf/memmove-1.c
> @@ -0,0 +1,46 @@
> +/* Ensure memmove is expanded inline rather than emitting a libcall.  */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +struct context {
> + unsigned int data;
> + unsigned int data_end;
> + unsigned int data_meta;
> + unsigned int ingress;
> + unsigned int queue_index;
> + unsigned int egress;
> +};
> +
> +void
> +mov_1_nooverlap (struct context *ctx)
> +{
> +  void *data = (void *)(long)ctx->data;
> +  char *dest;
> +  dest = data;
> +  dest += 16;
> +
> +  __builtin_memmove (dest, data, 12);
> +}
> +
> +void
> +mov_1_overlap (struct context *ctx)
> +{
> +  void *data = (void *)(long)ctx->data;
> +  char *dest;
> +  dest = data;
> +  dest += 4;
> +
> +  __builtin_memmove (dest, data, 12);
> +}
> +
> +void
> +mov_1_arbitrary (struct context *ctx_a, struct context *ctx_b)
> +{
> +  void *src = (void *)(long)ctx_a->data;
> +  void *dst = (void *)(long)ctx_b->data;
> +
> +  __builtin_memmove (dst, src, 12);
> +}
> +
> +/* { dg-final { scan-assembler-times "call" 0 } } */
> diff --git a/gcc/testsuite/gcc.target/bpf/memmove-2.c b/gcc/testsuite/gcc.target/bpf/memmove-2.c
> new file mode 100644
> index 00000000000..7fae96095a3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/bpf/memmove-2.c
> @@ -0,0 +1,23 @@
> +/* Test that we error if memmove cannot be expanded inline.  */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +typedef unsigned int __u32;
> +
> +struct context {
> + unsigned int data;
> + unsigned int data_end;
> + unsigned int data_meta;
> +};
> +
> +void
> +mov_2_unsupported (struct context *ctx)
> +{
> +  void *data = (void *)(long)ctx->data;
> +  char *dest;
> +  dest = data;
> +  dest += 4;
> +
> +  __builtin_memmove (dest, data, ctx->data_meta); /* { dg-error "could not expand" } */
> +}
  
David Faust Feb. 20, 2024, 10:49 p.m. UTC | #2
On 2/20/24 12:37, Jose E. Marchesi wrote:
> 
> Hi Faust.
> 
>> +bool
>> +bpf_expand_cpymem (rtx *operands, bool is_move)
>> +{
>> +  /* Size must be constant for this expansion to work.  */
>> +  if (!CONST_INT_P (operands[2]))
>> +    {
>> +      const char *name = is_move ? "memmove" : "memcpy";
>> +      if (flag_building_libgcc)
>> +	warning (RTL_LOCATION (operands[2]),
>> +		 "could not expand call to %<__builtin_%s%> inline: "
>> +		 "size must be constant", name);
>> +      else
>> +	error ("could not expand call to %<__builtin_%s%> inline: "
>> +	       "size must be constant", name);
>> +      return false;
>> +    }
> 
> I think you want to use warning_at and error_at above...  the first
> argument to `warning' (and second to warning_at) is not a location but
> an OPT_W* value.  You should pass 0 to the opt argument in this case as
> there is no -W option to handle this warning.

Oops. Thanks.

After fixing the arguments, `warning' and `error' generally give better
messages than `warning_at' and `error_at' since the
RTL_LOCATION (operands[2]) seems to be NULL for most (all?) cases, which
causes the diagnostic message to refer only to the containing function.
The non-`_at' versions highlight the builtin call being expanded, which
is much nicer.

> 
> Also, I would not mention 'expand' in the error/warning message, as it
> is a GCC internal concept.  I would use perhaps "could not inline call
> to %<__builtin_%s%>".

OK.

> 
>> +  /* Alignment is a CONST_INT.  */
>> +  gcc_assert (CONST_INT_P (operands[3]));
>> +
>> +  rtx dst = operands[0];
>> +  rtx src = operands[1];
>> +  rtx size = operands[2];
>> +  unsigned HOST_WIDE_INT size_bytes = UINTVAL (size);
>> +  unsigned align = UINTVAL (operands[3]);
>> +  enum machine_mode mode;
>> +  switch (align)
>> +    {
>> +    case 1: mode = QImode; break;
>> +    case 2: mode = HImode; break;
>> +    case 4: mode = SImode; break;
>> +    case 8: mode = DImode; break;
>> +    default:
>> +      gcc_unreachable ();
>> +    }
>> +
>> +  unsigned iters = size_bytes >> ceil_log2 (align);
>> +  unsigned remainder = size_bytes & (align - 1);
>> +
>> +  int inc = GET_MODE_SIZE (mode);
>> +  rtx_code_label *fwd_label, *done_label;
>> +  if (is_move)
>> +    {
>> +      /* For memmove, be careful of overlap.  It is not a concern for memcpy.
>> +	 To handle overlap, we check (at runtime) if SRC < DST, and if so do
>> +	 the move "backwards" starting from SRC + SIZE.  */
>> +      fwd_label = gen_label_rtx ();
>> +      done_label = gen_label_rtx ();
>> +
>> +      rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>> +      rtx src_addr = copy_to_mode_reg (Pmode, XEXP (src, 0));
>> +      emit_cmp_and_jump_insns (src_addr, dst_addr, GEU, NULL_RTX, Pmode,
>> +			       true, fwd_label, profile_probability::even ());
>> +
>> +      /* Emit the "backwards" unrolled loop.  */
>> +      emit_move_loop (src, dst, mode, size_bytes, -inc, iters, remainder);
>> +      emit_jump_insn (gen_jump (done_label));
>> +      emit_barrier ();
>> +
>> +      emit_label (fwd_label);
>> +    }
>> +
>> +  emit_move_loop (src, dst, mode, 0, inc, iters, remainder);
>> +
>> +  if (is_move)
>> +    emit_label (done_label);
>> +
>> +  return true;
>> +}
>> +
>>  /* Finally, build the GCC target.  */
>>  
>>  struct gcc_target targetm = TARGET_INITIALIZER;
>> diff --git a/gcc/config/bpf/bpf.md b/gcc/config/bpf/bpf.md
>> index 50df1aaa3e2..ca677bc6b50 100644
>> --- a/gcc/config/bpf/bpf.md
>> +++ b/gcc/config/bpf/bpf.md
>> @@ -627,4 +627,40 @@ (define_insn "ldabs<ldop>"
>>    "{ldabs<ldop>\t%0|r0 = *(<pldop> *) skb[%0]}"
>>    [(set_attr "type" "ld")])
>>  
>> +;;; memmove and memcopy
>> +
>> +;; 0 is dst
>> +;; 1 is src
>> +;; 2 is size of copy in bytes
>> +;; 3 is alignment
>> +
>> +(define_expand "cpymemdi"
>> +  [(match_operand:BLK 0 "memory_operand")
>> +   (match_operand:BLK 1 "memory_operand")
>> +   (match_operand:DI 2 "general_operand")
>> +   (match_operand:DI 3 "immediate_operand")]
>> +   ""
>> +{
>> +  if (bpf_expand_cpymem (operands, false))
>> +    DONE;
>> +  FAIL;
>> +})
>> +
>> +;; 0 is dst
>> +;; 1 is src
>> +;; 2 is size of copy in bytes
>> +;; 3 is alignment
>> +
>> +(define_expand "movmemdi"
>> +  [(match_operand:BLK 0 "memory_operand")
>> +   (match_operand:BLK 1 "memory_operand")
>> +   (match_operand:DI 2 "general_operand")
>> +   (match_operand:DI 3 "immediate_operand")]
>> +   ""
>> +{
>> +  if (bpf_expand_cpymem (operands, true))
>> +    DONE;
>> +  FAIL;
>> +})
>> +
>>  (include "atomic.md")
>> diff --git a/gcc/testsuite/gcc.target/bpf/memcpy-1.c b/gcc/testsuite/gcc.target/bpf/memcpy-1.c
>> new file mode 100644
>> index 00000000000..6c9707f24e8
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/bpf/memcpy-1.c
>> @@ -0,0 +1,26 @@
>> +/* Ensure memcpy is expanded inline rather than emitting a libcall.  */
>> +
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" } */
>> +
>> +struct context {
>> + unsigned int data;
>> + unsigned int data_end;
>> + unsigned int data_meta;
>> + unsigned int ingress;
>> + unsigned int queue_index;
>> + unsigned int egress;
>> +};
>> +
>> +void
>> +cpy_1(struct context *ctx)
>> +{
>> +  void *data = (void *)(long)ctx->data;
>> +  char *dest;
>> +  dest = data;
>> +  dest += 16;
>> +
>> +  __builtin_memcpy (dest, data, 8);
>> +}
>> +
>> +/* { dg-final { scan-assembler-times "call" 0 } } */
>> diff --git a/gcc/testsuite/gcc.target/bpf/memmove-1.c b/gcc/testsuite/gcc.target/bpf/memmove-1.c
>> new file mode 100644
>> index 00000000000..3b8ba82639e
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/bpf/memmove-1.c
>> @@ -0,0 +1,46 @@
>> +/* Ensure memmove is expanded inline rather than emitting a libcall.  */
>> +
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" } */
>> +
>> +struct context {
>> + unsigned int data;
>> + unsigned int data_end;
>> + unsigned int data_meta;
>> + unsigned int ingress;
>> + unsigned int queue_index;
>> + unsigned int egress;
>> +};
>> +
>> +void
>> +mov_1_nooverlap (struct context *ctx)
>> +{
>> +  void *data = (void *)(long)ctx->data;
>> +  char *dest;
>> +  dest = data;
>> +  dest += 16;
>> +
>> +  __builtin_memmove (dest, data, 12);
>> +}
>> +
>> +void
>> +mov_1_overlap (struct context *ctx)
>> +{
>> +  void *data = (void *)(long)ctx->data;
>> +  char *dest;
>> +  dest = data;
>> +  dest += 4;
>> +
>> +  __builtin_memmove (dest, data, 12);
>> +}
>> +
>> +void
>> +mov_1_arbitrary (struct context *ctx_a, struct context *ctx_b)
>> +{
>> +  void *src = (void *)(long)ctx_a->data;
>> +  void *dst = (void *)(long)ctx_b->data;
>> +
>> +  __builtin_memmove (dst, src, 12);
>> +}
>> +
>> +/* { dg-final { scan-assembler-times "call" 0 } } */
>> diff --git a/gcc/testsuite/gcc.target/bpf/memmove-2.c b/gcc/testsuite/gcc.target/bpf/memmove-2.c
>> new file mode 100644
>> index 00000000000..7fae96095a3
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/bpf/memmove-2.c
>> @@ -0,0 +1,23 @@
>> +/* Test that we error if memmove cannot be expanded inline.  */
>> +
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" } */
>> +
>> +typedef unsigned int __u32;
>> +
>> +struct context {
>> + unsigned int data;
>> + unsigned int data_end;
>> + unsigned int data_meta;
>> +};
>> +
>> +void
>> +mov_2_unsupported (struct context *ctx)
>> +{
>> +  void *data = (void *)(long)ctx->data;
>> +  char *dest;
>> +  dest = data;
>> +  dest += 4;
>> +
>> +  __builtin_memmove (dest, data, ctx->data_meta); /* { dg-error "could not expand" } */
>> +}
  

Patch

diff --git a/gcc/config/bpf/bpf-protos.h b/gcc/config/bpf/bpf-protos.h
index 46d950bd990..366acb87ae4 100644
--- a/gcc/config/bpf/bpf-protos.h
+++ b/gcc/config/bpf/bpf-protos.h
@@ -35,4 +35,6 @@  const char *bpf_add_core_reloc (rtx *operands, const char *templ);
 class gimple_opt_pass;
 gimple_opt_pass *make_pass_lower_bpf_core (gcc::context *ctxt);
 
+bool bpf_expand_cpymem (rtx *, bool);
+
 #endif /* ! GCC_BPF_PROTOS_H */
diff --git a/gcc/config/bpf/bpf.cc b/gcc/config/bpf/bpf.cc
index d6ca47eeecb..5db031a4551 100644
--- a/gcc/config/bpf/bpf.cc
+++ b/gcc/config/bpf/bpf.cc
@@ -1184,6 +1184,122 @@  bpf_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
   bpf_use_by_pieces_infrastructure_p
 
+/* Helper for bpf_expand_cpymem.  Emit an unrolled loop moving the bytes
+   from SRC to DST.  */
+
+static void
+emit_move_loop (rtx src, rtx dst, machine_mode mode, int offset, int inc,
+		unsigned iters, unsigned remainder)
+{
+  rtx reg = gen_reg_rtx (mode);
+
+  /* First copy in chunks as large as alignment permits.  */
+  for (unsigned int i = 0; i < iters; i++)
+    {
+      emit_move_insn (reg, adjust_address (src, mode, offset));
+      emit_move_insn (adjust_address (dst, mode, offset), reg);
+      offset += inc;
+    }
+
+  /* Handle remaining bytes which might be smaller than the chunks
+     used above.  */
+  if (remainder & 4)
+    {
+      emit_move_insn (reg, adjust_address (src, SImode, offset));
+      emit_move_insn (adjust_address (dst, SImode, offset), reg);
+      offset += (inc < 0 ? -4 : 4);
+      remainder -= 4;
+    }
+  if (remainder & 2)
+    {
+      emit_move_insn (reg, adjust_address (src, HImode, offset));
+      emit_move_insn (adjust_address (dst, HImode, offset), reg);
+      offset += (inc < 0 ? -2 : 2);
+      remainder -= 2;
+    }
+  if (remainder & 1)
+    {
+      emit_move_insn (reg, adjust_address (src, QImode, offset));
+      emit_move_insn (adjust_address (dst, QImode, offset), reg);
+    }
+}
+
+/* Expand cpymem/movmem, as from __builtin_memcpy/memmove.
+   OPERANDS are the same as the cpymem/movmem patterns.
+   IS_MOVE is true if this is a memmove, false for memcpy.
+   Return true if we successfully expanded, or false if we cannot
+   and must punt to a libcall.  */
+
+bool
+bpf_expand_cpymem (rtx *operands, bool is_move)
+{
+  /* Size must be constant for this expansion to work.  */
+  if (!CONST_INT_P (operands[2]))
+    {
+      const char *name = is_move ? "memmove" : "memcpy";
+      if (flag_building_libgcc)
+	warning (RTL_LOCATION (operands[2]),
+		 "could not expand call to %<__builtin_%s%> inline: "
+		 "size must be constant", name);
+      else
+	error ("could not expand call to %<__builtin_%s%> inline: "
+	       "size must be constant", name);
+      return false;
+    }
+
+  /* Alignment is a CONST_INT.  */
+  gcc_assert (CONST_INT_P (operands[3]));
+
+  rtx dst = operands[0];
+  rtx src = operands[1];
+  rtx size = operands[2];
+  unsigned HOST_WIDE_INT size_bytes = UINTVAL (size);
+  unsigned align = UINTVAL (operands[3]);
+  enum machine_mode mode;
+  switch (align)
+    {
+    case 1: mode = QImode; break;
+    case 2: mode = HImode; break;
+    case 4: mode = SImode; break;
+    case 8: mode = DImode; break;
+    default:
+      gcc_unreachable ();
+    }
+
+  unsigned iters = size_bytes >> ceil_log2 (align);
+  unsigned remainder = size_bytes & (align - 1);
+
+  int inc = GET_MODE_SIZE (mode);
+  rtx_code_label *fwd_label, *done_label;
+  if (is_move)
+    {
+      /* For memmove, be careful of overlap.  It is not a concern for memcpy.
+	 To handle overlap, we check (at runtime) if SRC < DST, and if so do
+	 the move "backwards" starting from SRC + SIZE.  */
+      fwd_label = gen_label_rtx ();
+      done_label = gen_label_rtx ();
+
+      rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+      rtx src_addr = copy_to_mode_reg (Pmode, XEXP (src, 0));
+      emit_cmp_and_jump_insns (src_addr, dst_addr, GEU, NULL_RTX, Pmode,
+			       true, fwd_label, profile_probability::even ());
+
+      /* Emit the "backwards" unrolled loop.  */
+      emit_move_loop (src, dst, mode, size_bytes, -inc, iters, remainder);
+      emit_jump_insn (gen_jump (done_label));
+      emit_barrier ();
+
+      emit_label (fwd_label);
+    }
+
+  emit_move_loop (src, dst, mode, 0, inc, iters, remainder);
+
+  if (is_move)
+    emit_label (done_label);
+
+  return true;
+}
+
 /* Finally, build the GCC target.  */
 
 struct gcc_target targetm = TARGET_INITIALIZER;
diff --git a/gcc/config/bpf/bpf.md b/gcc/config/bpf/bpf.md
index 50df1aaa3e2..ca677bc6b50 100644
--- a/gcc/config/bpf/bpf.md
+++ b/gcc/config/bpf/bpf.md
@@ -627,4 +627,40 @@  (define_insn "ldabs<ldop>"
   "{ldabs<ldop>\t%0|r0 = *(<pldop> *) skb[%0]}"
   [(set_attr "type" "ld")])
 
+;;; memmove and memcopy
+
+;; 0 is dst
+;; 1 is src
+;; 2 is size of copy in bytes
+;; 3 is alignment
+
+(define_expand "cpymemdi"
+  [(match_operand:BLK 0 "memory_operand")
+   (match_operand:BLK 1 "memory_operand")
+   (match_operand:DI 2 "general_operand")
+   (match_operand:DI 3 "immediate_operand")]
+   ""
+{
+  if (bpf_expand_cpymem (operands, false))
+    DONE;
+  FAIL;
+})
+
+;; 0 is dst
+;; 1 is src
+;; 2 is size of copy in bytes
+;; 3 is alignment
+
+(define_expand "movmemdi"
+  [(match_operand:BLK 0 "memory_operand")
+   (match_operand:BLK 1 "memory_operand")
+   (match_operand:DI 2 "general_operand")
+   (match_operand:DI 3 "immediate_operand")]
+   ""
+{
+  if (bpf_expand_cpymem (operands, true))
+    DONE;
+  FAIL;
+})
+
 (include "atomic.md")
diff --git a/gcc/testsuite/gcc.target/bpf/memcpy-1.c b/gcc/testsuite/gcc.target/bpf/memcpy-1.c
new file mode 100644
index 00000000000..6c9707f24e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/bpf/memcpy-1.c
@@ -0,0 +1,26 @@ 
+/* Ensure memcpy is expanded inline rather than emitting a libcall.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+struct context {
+ unsigned int data;
+ unsigned int data_end;
+ unsigned int data_meta;
+ unsigned int ingress;
+ unsigned int queue_index;
+ unsigned int egress;
+};
+
+void
+cpy_1(struct context *ctx)
+{
+  void *data = (void *)(long)ctx->data;
+  char *dest;
+  dest = data;
+  dest += 16;
+
+  __builtin_memcpy (dest, data, 8);
+}
+
+/* { dg-final { scan-assembler-times "call" 0 } } */
diff --git a/gcc/testsuite/gcc.target/bpf/memmove-1.c b/gcc/testsuite/gcc.target/bpf/memmove-1.c
new file mode 100644
index 00000000000..3b8ba82639e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/bpf/memmove-1.c
@@ -0,0 +1,46 @@ 
+/* Ensure memmove is expanded inline rather than emitting a libcall.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+struct context {
+ unsigned int data;
+ unsigned int data_end;
+ unsigned int data_meta;
+ unsigned int ingress;
+ unsigned int queue_index;
+ unsigned int egress;
+};
+
+void
+mov_1_nooverlap (struct context *ctx)
+{
+  void *data = (void *)(long)ctx->data;
+  char *dest;
+  dest = data;
+  dest += 16;
+
+  __builtin_memmove (dest, data, 12);
+}
+
+void
+mov_1_overlap (struct context *ctx)
+{
+  void *data = (void *)(long)ctx->data;
+  char *dest;
+  dest = data;
+  dest += 4;
+
+  __builtin_memmove (dest, data, 12);
+}
+
+void
+mov_1_arbitrary (struct context *ctx_a, struct context *ctx_b)
+{
+  void *src = (void *)(long)ctx_a->data;
+  void *dst = (void *)(long)ctx_b->data;
+
+  __builtin_memmove (dst, src, 12);
+}
+
+/* { dg-final { scan-assembler-times "call" 0 } } */
diff --git a/gcc/testsuite/gcc.target/bpf/memmove-2.c b/gcc/testsuite/gcc.target/bpf/memmove-2.c
new file mode 100644
index 00000000000..7fae96095a3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/bpf/memmove-2.c
@@ -0,0 +1,23 @@ 
+/* Test that we error if memmove cannot be expanded inline.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef unsigned int __u32;
+
+struct context {
+ unsigned int data;
+ unsigned int data_end;
+ unsigned int data_meta;
+};
+
+void
+mov_2_unsupported (struct context *ctx)
+{
+  void *data = (void *)(long)ctx->data;
+  char *dest;
+  dest = data;
+  dest += 4;
+
+  __builtin_memmove (dest, data, ctx->data_meta); /* { dg-error "could not expand" } */
+}