PowerPC: memmove default implementation cleanup
Commit Message
This patch removes the powerpc specific logic in memmove and instead
include default implementation with MEMCPY_OK_FOR_FWD_MEMMOVE defined.
This lead in a increase performance, since the constraints to use
memcpy in powerpc code are too restrictive and memcpy can be used for
any forward memmove (and the rest to PPC implementation is just a copy
of default one).
Checked on powerpc32-fpu and powerpc64be. OK to apply?
--
* sysdeps/powerpc/memmove.c (memmove): Cleanup impplementation to use
glibc default one.
---
Comments
On Tue, Jun 24, 2014 at 11:21:30AM -0300, Adhemerval Zanella wrote:
> This patch removes the powerpc specific logic in memmove and instead
> include default implementation with MEMCPY_OK_FOR_FWD_MEMMOVE defined.
> This lead in a increase performance, since the constraints to use
> memcpy in powerpc code are too restrictive and memcpy can be used for
> any forward memmove (and the rest to PPC implementation is just a copy
> of default one).
>
> Checked on powerpc32-fpu and powerpc64be. OK to apply?
>
That looks ok, I would go step more and say that aliasing memcpy to
memmove will differ only by few cycles and could improve performance
due of cache locality but I did not do test yet.
On 24-06-2014 13:11, Ondřej Bílka wrote:
> On Tue, Jun 24, 2014 at 11:21:30AM -0300, Adhemerval Zanella wrote:
>> This patch removes the powerpc specific logic in memmove and instead
>> include default implementation with MEMCPY_OK_FOR_FWD_MEMMOVE defined.
>> This lead in a increase performance, since the constraints to use
>> memcpy in powerpc code are too restrictive and memcpy can be used for
>> any forward memmove (and the rest to PPC implementation is just a copy
>> of default one).
>>
>> Checked on powerpc32-fpu and powerpc64be. OK to apply?
>>
> That looks ok, I would go step more and say that aliasing memcpy to
> memmove will differ only by few cycles and could improve performance
> due of cache locality but I did not do test yet.
>
Thanks for review. Now with optimized power7, I think I can check if alias
shows a performance boost (although such few cycles will be hard to evaluate
due os jitter). However, I don't think we can enable now for all power
implementations (since for power4, power6, etc. that have optimized
memcpy, this will mean another branch).
@@ -18,101 +18,5 @@
License along with the GNU C Library; see the file COPYING.LIB. If
not, see <http://www.gnu.org/licenses/>. */
-#include <string.h>
-#include <memcopy.h>
-#include <pagecopy.h>
-
-/* All this is so that bcopy.c can #include
- this file after defining some things. */
-#ifndef a1
-#define a1 dest /* First arg is DEST. */
-#define a1const
-#define a2 src /* Second arg is SRC. */
-#define a2const const
-#undef memmove
-#endif
-#if !defined(RETURN) || !defined(rettype)
-#define RETURN(s) return (s) /* Return DEST. */
-#define rettype void *
-#endif
-
-#ifndef MEMMOVE
-#define MEMMOVE memmove
-#endif
-
-rettype
-MEMMOVE (a1, a2, len)
- a1const void *a1;
- a2const void *a2;
- size_t len;
-{
- unsigned long int dstp = (long int) dest;
- unsigned long int srcp = (long int) src;
-
- /* If there is no overlap between ranges, call the builtin memcpy. */
- if (dstp >= srcp + len || srcp > dstp + len)
- __builtin_memcpy (dest, src, len);
-
- /* This test makes the forward copying code be used whenever possible.
- Reduces the working set. */
- else if (dstp - srcp >= len) /* *Unsigned* compare! */
- {
- /* Copy from the beginning to the end. */
-
- /* If there not too few bytes to copy, use word copy. */
- if (len >= OP_T_THRES)
- {
- /* Copy just a few bytes to make DSTP aligned. */
- len -= (-dstp) % OPSIZ;
- BYTE_COPY_FWD (dstp, srcp, (-dstp) % OPSIZ);
-
- /* Copy whole pages from SRCP to DSTP by virtual address
- manipulation, as much as possible. */
-
- PAGE_COPY_FWD_MAYBE (dstp, srcp, len, len);
-
- /* Copy from SRCP to DSTP taking advantage of the known
- alignment of DSTP. Number of bytes remaining is put
- in the third argument, i.e. in LEN. This number may
- vary from machine to machine. */
-
- WORD_COPY_FWD (dstp, srcp, len, len);
-
- /* Fall out and copy the tail. */
- }
-
- /* There are just a few bytes to copy. Use byte memory operations. */
- BYTE_COPY_FWD (dstp, srcp, len);
- }
- else
- {
- /* Copy from the end to the beginning. */
- srcp += len;
- dstp += len;
-
- /* If there not too few bytes to copy, use word copy. */
- if (len >= OP_T_THRES)
- {
- /* Copy just a few bytes to make DSTP aligned. */
- len -= dstp % OPSIZ;
- BYTE_COPY_BWD (dstp, srcp, dstp % OPSIZ);
-
- /* Copy from SRCP to DSTP taking advantage of the known
- alignment of DSTP. Number of bytes remaining is put
- in the third argument, i.e. in LEN. This number may
- vary from machine to machine. */
-
- WORD_COPY_BWD (dstp, srcp, len, len);
-
- /* Fall out and copy the tail. */
- }
-
- /* There are just a few bytes to copy. Use byte memory operations. */
- BYTE_COPY_BWD (dstp, srcp, len);
- }
-
- RETURN (dest);
-}
-#ifndef memmove
-libc_hidden_builtin_def (memmove)
-#endif
+#define MEMCPY_OK_FOR_FWD_MEMMOVE 1
+#include <string/memmove.c>