From patchwork Tue Sep 14 06:30:37 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 44974 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 4664A3858436 for ; Tue, 14 Sep 2021 06:33:26 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 4664A3858436 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1631601206; bh=FFM0d0/seXcNtzd5Y/BrYUfl1OzBKHIyixwHMMVQKJM=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=y4u7Qg8ont4T91fD3pJlxp54qvHqQzbS6agOsl8kvpeOrEDBCmHw0khGxirx4Lz8E YTaCfAyCRAoiCRFTgbYZTzL42Vk4ms5JpyjvpXtLZ5VLgwuQjubmCY+GhVIu5sU6LT aSUfdmyssFgQTg9oQuMn1pKq2iCh/C1dLQtUXJj4= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-il1-x131.google.com (mail-il1-x131.google.com [IPv6:2607:f8b0:4864:20::131]) by sourceware.org (Postfix) with ESMTPS id 91B363858402 for ; Tue, 14 Sep 2021 06:30:54 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 91B363858402 Received: by mail-il1-x131.google.com with SMTP id h20so11936314ilj.13 for ; Mon, 13 Sep 2021 23:30:54 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=FFM0d0/seXcNtzd5Y/BrYUfl1OzBKHIyixwHMMVQKJM=; b=hFSD1+H+FOua5Kse/EYU8N9BtPGilwBHhsJmkPgQWaA7zNwMBx1sVr3kSW2GA2BAVN i4EbBwr0VYyS4D9337QLzf0s020+x6PQhFuqsetCwHsWGeagWJaYuh8+sxKFrYbjO2no PkZSRD4Fk3TjUZmxFvGrU7a9wzWaGsyMUMMjj/kWXJh+koa49wZNLzjRnRd5tgF2Hh9d 87vqfzti6kNTamC6ANRG5RSmMiTBs+VQnA/T3Fls2O6YFytvptnRI2fOoEpZAXToL2y5 
yswaX7oQTEAdYhMqR3KMdeHOj+t7G5nx8P6OYz7L12l6GQnSg2AQvIP+4Vk/OYHEC4db 3loQ== X-Gm-Message-State: AOAM530pALlP83rhmOlPCrBKxIS8SczOhB1+6/DF2wcgJ6rlEV+oSbW6 4r9OSeX/4GGJlg2gZgkMiJfB5GEoFoGc2Q== X-Google-Smtp-Source: ABdhPJz/PDh5OIGXWqV5srkbdOgr1n77nNM0f/bh+yWmJuJP1NRvNKa668LahzsSZCS9De2Vyz1lrQ== X-Received: by 2002:a05:6e02:198d:: with SMTP id g13mr11283247ilf.319.1631601053356; Mon, 13 Sep 2021 23:30:53 -0700 (PDT) Received: from localhost.localdomain (node-17-161.flex.volo.net. [76.191.17.161]) by smtp.googlemail.com with ESMTPSA id b10sm6101328ils.13.2021.09.13.23.30.52 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 13 Sep 2021 23:30:53 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v2 3/5] x86_64: Add sse4_1 optimized bcmp implementation in memcmp-sse4.S Date: Tue, 14 Sep 2021 01:30:37 -0500 Message-Id: <20210914063039.1126196-3-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20210914063039.1126196-1-goldstein.w.n@gmail.com> References: <20210913230506.546749-1-goldstein.w.n@gmail.com> <20210914063039.1126196-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-10.2 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SCC_10_SHORT_WORD_LINES, SCC_20_SHORT_WORD_LINES, SCC_35_SHORT_WORD_LINES, SCC_5_SHORT_WORD_LINES, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" No bug. This commit does not modify any of the memcmp implementation. 
It just adds bcmp ifdefs to skip obvious cases where computing the correctly signed (negative/positive) return value required by memcmp is not needed, since bcmp only has to report zero versus non-zero. test-memcmp, test-bcmp, and test-wmemcmp are all passing. --- sysdeps/x86_64/multiarch/memcmp-sse4.S | 761 ++++++++++++++++++++++++- 1 file changed, 746 insertions(+), 15 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S index b82adcd5fa..b9528ed58e 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -72,7 +72,11 @@ L(79bytesormore): movdqu (%rdi), %xmm2 pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(16bytesin256) +# endif mov %rsi, %rcx and $-16, %rsi add $16, %rsi @@ -91,34 +95,58 @@ L(less128bytes): movdqu (%rdi), %xmm2 pxor (%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(16bytesin256) +# endif movdqu 16(%rdi), %xmm2 pxor 16(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(32bytesin256) +# endif movdqu 32(%rdi), %xmm2 pxor 32(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(48bytesin256) +# endif movdqu 48(%rdi), %xmm2 pxor 48(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(64bytesin256) +# endif cmp $32, %rdx jb L(less32bytesin64) movdqu 64(%rdi), %xmm2 pxor 64(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(80bytesin256) +# endif movdqu 80(%rdi), %xmm2 pxor 80(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(96bytesin256) +# endif sub $32, %rdx add $32, %rdi add $32, %rsi @@ -140,42 +168,74 @@ L(less256bytes): movdqu (%rdi), %xmm2 pxor (%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(16bytesin256) +# endif movdqu 16(%rdi), %xmm2 pxor 16(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef 
USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(32bytesin256) +# endif movdqu 32(%rdi), %xmm2 pxor 32(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(48bytesin256) +# endif movdqu 48(%rdi), %xmm2 pxor 48(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(64bytesin256) +# endif movdqu 64(%rdi), %xmm2 pxor 64(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(80bytesin256) +# endif movdqu 80(%rdi), %xmm2 pxor 80(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(96bytesin256) +# endif movdqu 96(%rdi), %xmm2 pxor 96(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(112bytesin256) +# endif movdqu 112(%rdi), %xmm2 pxor 112(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(128bytesin256) +# endif add $128, %rsi add $128, %rdi @@ -189,12 +249,20 @@ L(less256bytes): movdqu (%rdi), %xmm2 pxor (%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(16bytesin256) +# endif movdqu 16(%rdi), %xmm2 pxor 16(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(32bytesin256) +# endif sub $32, %rdx add $32, %rdi add $32, %rsi @@ -208,82 +276,146 @@ L(less512bytes): movdqu (%rdi), %xmm2 pxor (%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(16bytesin256) +# endif movdqu 16(%rdi), %xmm2 pxor 16(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(32bytesin256) +# endif movdqu 32(%rdi), %xmm2 pxor 32(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(48bytesin256) +# endif movdqu 48(%rdi), %xmm2 pxor 48(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(64bytesin256) +# endif 
movdqu 64(%rdi), %xmm2 pxor 64(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(80bytesin256) +# endif movdqu 80(%rdi), %xmm2 pxor 80(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(96bytesin256) +# endif movdqu 96(%rdi), %xmm2 pxor 96(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(112bytesin256) +# endif movdqu 112(%rdi), %xmm2 pxor 112(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(128bytesin256) +# endif movdqu 128(%rdi), %xmm2 pxor 128(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(144bytesin256) +# endif movdqu 144(%rdi), %xmm2 pxor 144(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(160bytesin256) +# endif movdqu 160(%rdi), %xmm2 pxor 160(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(176bytesin256) +# endif movdqu 176(%rdi), %xmm2 pxor 176(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(192bytesin256) +# endif movdqu 192(%rdi), %xmm2 pxor 192(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(208bytesin256) +# endif movdqu 208(%rdi), %xmm2 pxor 208(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(224bytesin256) +# endif movdqu 224(%rdi), %xmm2 pxor 224(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(240bytesin256) +# endif movdqu 240(%rdi), %xmm2 pxor 240(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(256bytesin256) +# endif add $256, %rsi add $256, %rdi @@ -300,12 +432,20 @@ L(less512bytes): movdqu (%rdi), %xmm2 pxor (%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc 
L(16bytesin256) +# endif movdqu 16(%rdi), %xmm2 pxor 16(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(32bytesin256) +# endif sub $32, %rdx add $32, %rdi add $32, %rsi @@ -346,7 +486,11 @@ L(64bytesormore_loop): por %xmm5, %xmm1 ptest %xmm1, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(64bytesormore_loop_end) +# endif add $64, %rsi add $64, %rdi sub $64, %rdx @@ -380,7 +524,11 @@ L(L2_L3_unaligned_128bytes_loop): por %xmm5, %xmm1 ptest %xmm1, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(64bytesormore_loop_end) +# endif add $64, %rsi add $64, %rdi sub $64, %rdx @@ -404,34 +552,58 @@ L(less128bytesin2aligned): movdqa (%rdi), %xmm2 pxor (%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(16bytesin256) +# endif movdqa 16(%rdi), %xmm2 pxor 16(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(32bytesin256) +# endif movdqa 32(%rdi), %xmm2 pxor 32(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(48bytesin256) +# endif movdqa 48(%rdi), %xmm2 pxor 48(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(64bytesin256) +# endif cmp $32, %rdx jb L(less32bytesin64in2alinged) movdqa 64(%rdi), %xmm2 pxor 64(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(80bytesin256) +# endif movdqa 80(%rdi), %xmm2 pxor 80(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(96bytesin256) +# endif sub $32, %rdx add $32, %rdi add $32, %rsi @@ -454,42 +626,74 @@ L(less256bytesin2alinged): movdqa (%rdi), %xmm2 pxor (%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(16bytesin256) +# endif movdqa 16(%rdi), %xmm2 pxor 16(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc 
L(32bytesin256) +# endif movdqa 32(%rdi), %xmm2 pxor 32(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(48bytesin256) +# endif movdqa 48(%rdi), %xmm2 pxor 48(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(64bytesin256) +# endif movdqa 64(%rdi), %xmm2 pxor 64(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(80bytesin256) +# endif movdqa 80(%rdi), %xmm2 pxor 80(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(96bytesin256) +# endif movdqa 96(%rdi), %xmm2 pxor 96(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(112bytesin256) +# endif movdqa 112(%rdi), %xmm2 pxor 112(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(128bytesin256) +# endif add $128, %rsi add $128, %rdi @@ -503,12 +707,20 @@ L(less256bytesin2alinged): movdqu (%rdi), %xmm2 pxor (%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(16bytesin256) +# endif movdqu 16(%rdi), %xmm2 pxor 16(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(32bytesin256) +# endif sub $32, %rdx add $32, %rdi add $32, %rsi @@ -524,82 +736,146 @@ L(256bytesormorein2aligned): movdqa (%rdi), %xmm2 pxor (%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(16bytesin256) +# endif movdqa 16(%rdi), %xmm2 pxor 16(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(32bytesin256) +# endif movdqa 32(%rdi), %xmm2 pxor 32(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(48bytesin256) +# endif movdqa 48(%rdi), %xmm2 pxor 48(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(64bytesin256) +# endif movdqa 64(%rdi), %xmm2 pxor 
64(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(80bytesin256) +# endif movdqa 80(%rdi), %xmm2 pxor 80(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(96bytesin256) +# endif movdqa 96(%rdi), %xmm2 pxor 96(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(112bytesin256) +# endif movdqa 112(%rdi), %xmm2 pxor 112(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(128bytesin256) +# endif movdqa 128(%rdi), %xmm2 pxor 128(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(144bytesin256) +# endif movdqa 144(%rdi), %xmm2 pxor 144(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(160bytesin256) +# endif movdqa 160(%rdi), %xmm2 pxor 160(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(176bytesin256) +# endif movdqa 176(%rdi), %xmm2 pxor 176(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(192bytesin256) +# endif movdqa 192(%rdi), %xmm2 pxor 192(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(208bytesin256) +# endif movdqa 208(%rdi), %xmm2 pxor 208(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(224bytesin256) +# endif movdqa 224(%rdi), %xmm2 pxor 224(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(240bytesin256) +# endif movdqa 240(%rdi), %xmm2 pxor 240(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(256bytesin256) +# endif add $256, %rsi add $256, %rdi @@ -616,12 +892,20 @@ L(256bytesormorein2aligned): movdqa (%rdi), %xmm2 pxor (%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(16bytesin256) +# 
endif movdqa 16(%rdi), %xmm2 pxor 16(%rsi), %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(32bytesin256) +# endif sub $32, %rdx add $32, %rdi add $32, %rsi @@ -663,7 +947,11 @@ L(64bytesormore_loopin2aligned): por %xmm5, %xmm1 ptest %xmm1, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(64bytesormore_loop_end) +# endif add $64, %rsi add $64, %rdi sub $64, %rdx @@ -697,7 +985,11 @@ L(L2_L3_aligned_128bytes_loop): por %xmm5, %xmm1 ptest %xmm1, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(64bytesormore_loop_end) +# endif add $64, %rsi add $64, %rdi sub $64, %rdx @@ -708,7 +1000,7 @@ L(L2_L3_aligned_128bytes_loop): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - +# ifndef USE_AS_BCMP .p2align 4 L(64bytesormore_loop_end): add $16, %rdi @@ -791,17 +1083,29 @@ L(32bytesin256): L(16bytesin256): add $16, %rdi add $16, %rsi +# endif L(16bytes): mov -16(%rdi), %rax mov -16(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif L(8bytes): mov -8(%rdi), %rax mov -8(%rsi), %rcx +# ifdef USE_AS_BCMP + sub %rcx, %rax + mov %rax, %rcx + shr $32, %rcx + or %ecx, %eax +# else cmp %rax, %rcx jne L(diffin8bytes) xor %eax, %eax +# endif ret .p2align 4 @@ -809,16 +1113,26 @@ L(12bytes): mov -12(%rdi), %rax mov -12(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif L(4bytes): mov -4(%rsi), %ecx -# ifndef USE_AS_WMEMCMP +# ifdef USE_AS_BCMP mov -4(%rdi), %eax - cmp %eax, %ecx + sub %ecx, %eax + ret # else +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax + cmp %eax, %ecx +# else cmp -4(%rdi), %ecx -# endif +# endif jne L(diffin4bytes) +# endif L(0bytes): xor %eax, %eax ret @@ -832,31 +1146,51 @@ L(65bytes): mov $-65, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(49bytes): movdqu -49(%rdi), %xmm1 
movdqu -49(%rsi), %xmm2 mov $-49, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(33bytes): movdqu -33(%rdi), %xmm1 movdqu -33(%rsi), %xmm2 mov $-33, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(17bytes): mov -17(%rdi), %rax mov -17(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif L(9bytes): mov -9(%rdi), %rax mov -9(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif movzbl -1(%rdi), %eax movzbl -1(%rsi), %edx sub %edx, %eax @@ -867,12 +1201,23 @@ L(13bytes): mov -13(%rdi), %rax mov -13(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif mov -8(%rdi), %rax mov -8(%rsi), %rcx +# ifdef USE_AS_BCMP + sub %rcx, %rax + mov %rax, %rcx + shr $32, %rcx + or %ecx, %eax +# else cmp %rax, %rcx jne L(diffin8bytes) xor %eax, %eax +# endif ret .p2align 4 @@ -880,7 +1225,11 @@ L(5bytes): mov -5(%rdi), %eax mov -5(%rsi), %ecx cmp %eax, %ecx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin4bytes) +# endif movzbl -1(%rdi), %eax movzbl -1(%rsi), %edx sub %edx, %eax @@ -893,37 +1242,59 @@ L(66bytes): mov $-66, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(50bytes): movdqu -50(%rdi), %xmm1 movdqu -50(%rsi), %xmm2 mov $-50, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(34bytes): movdqu -34(%rdi), %xmm1 movdqu -34(%rsi), %xmm2 mov $-34, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(18bytes): mov -18(%rdi), %rax mov -18(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne 
L(return_not_equals) +# else jne L(diffin8bytes) +# endif L(10bytes): mov -10(%rdi), %rax mov -10(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif movzwl -2(%rdi), %eax movzwl -2(%rsi), %ecx +# ifndef USE_AS_BCMP cmp %cl, %al jne L(end) and $0xffff, %eax and $0xffff, %ecx +# endif sub %ecx, %eax ret @@ -932,12 +1303,23 @@ L(14bytes): mov -14(%rdi), %rax mov -14(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif mov -8(%rdi), %rax mov -8(%rsi), %rcx +# ifdef USE_AS_BCMP + sub %rcx, %rax + mov %rax, %rcx + shr $32, %rcx + or %ecx, %eax +# else cmp %rax, %rcx jne L(diffin8bytes) xor %eax, %eax +# endif ret .p2align 4 @@ -945,14 +1327,20 @@ L(6bytes): mov -6(%rdi), %eax mov -6(%rsi), %ecx cmp %eax, %ecx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin4bytes) +# endif L(2bytes): movzwl -2(%rsi), %ecx movzwl -2(%rdi), %eax +# ifndef USE_AS_BCMP cmp %cl, %al jne L(end) and $0xffff, %eax and $0xffff, %ecx +# endif sub %ecx, %eax ret @@ -963,36 +1351,60 @@ L(67bytes): mov $-67, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(51bytes): movdqu -51(%rdi), %xmm2 movdqu -51(%rsi), %xmm1 mov $-51, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(35bytes): movdqu -35(%rsi), %xmm1 movdqu -35(%rdi), %xmm2 mov $-35, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(19bytes): mov -19(%rdi), %rax mov -19(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif L(11bytes): mov -11(%rdi), %rax mov -11(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif mov -4(%rdi), %eax mov -4(%rsi), %ecx +# ifdef 
USE_AS_BCMP + sub %ecx, %eax +# else cmp %eax, %ecx jne L(diffin4bytes) xor %eax, %eax +# endif ret .p2align 4 @@ -1000,12 +1412,23 @@ L(15bytes): mov -15(%rdi), %rax mov -15(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif mov -8(%rdi), %rax mov -8(%rsi), %rcx +# ifdef USE_AS_BCMP + sub %rcx, %rax + mov %rax, %rcx + shr $32, %rcx + or %ecx, %eax +# else cmp %rax, %rcx jne L(diffin8bytes) xor %eax, %eax +# endif ret .p2align 4 @@ -1013,12 +1436,20 @@ L(7bytes): mov -7(%rdi), %eax mov -7(%rsi), %ecx cmp %eax, %ecx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin4bytes) +# endif mov -4(%rdi), %eax mov -4(%rsi), %ecx +# ifdef USE_AS_BCMP + sub %ecx, %eax +# else cmp %eax, %ecx jne L(diffin4bytes) xor %eax, %eax +# endif ret .p2align 4 @@ -1026,7 +1457,11 @@ L(3bytes): movzwl -3(%rdi), %eax movzwl -3(%rsi), %ecx cmp %eax, %ecx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin2bytes) +# endif L(1bytes): movzbl -1(%rdi), %eax movzbl -1(%rsi), %ecx @@ -1041,38 +1476,58 @@ L(68bytes): mov $-68, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(52bytes): movdqu -52(%rdi), %xmm2 movdqu -52(%rsi), %xmm1 mov $-52, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(36bytes): movdqu -36(%rdi), %xmm2 movdqu -36(%rsi), %xmm1 mov $-36, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(20bytes): movdqu -20(%rdi), %xmm2 movdqu -20(%rsi), %xmm1 mov $-20, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -4(%rsi), %ecx - -# ifndef USE_AS_WMEMCMP +# ifdef USE_AS_BCMP mov -4(%rdi), %eax - cmp %eax, %ecx + sub %ecx, %eax # else +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax + cmp %eax, 
%ecx +# else cmp -4(%rdi), %ecx -# endif +# endif jne L(diffin4bytes) xor %eax, %eax +# endif ret # ifndef USE_AS_WMEMCMP @@ -1084,32 +1539,52 @@ L(69bytes): mov $-69, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(53bytes): movdqu -53(%rsi), %xmm1 movdqu -53(%rdi), %xmm2 mov $-53, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(37bytes): movdqu -37(%rsi), %xmm1 movdqu -37(%rdi), %xmm2 mov $-37, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(21bytes): movdqu -21(%rsi), %xmm1 movdqu -21(%rdi), %xmm2 mov $-21, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -8(%rdi), %rax mov -8(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif xor %eax, %eax ret @@ -1120,32 +1595,52 @@ L(70bytes): mov $-70, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(54bytes): movdqu -54(%rsi), %xmm1 movdqu -54(%rdi), %xmm2 mov $-54, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(38bytes): movdqu -38(%rsi), %xmm1 movdqu -38(%rdi), %xmm2 mov $-38, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(22bytes): movdqu -22(%rsi), %xmm1 movdqu -22(%rdi), %xmm2 mov $-22, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -8(%rdi), %rax mov -8(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif xor %eax, %eax ret @@ -1156,32 +1651,52 @@ L(71bytes): mov $-71, %dl 
pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(55bytes): movdqu -55(%rdi), %xmm2 movdqu -55(%rsi), %xmm1 mov $-55, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(39bytes): movdqu -39(%rdi), %xmm2 movdqu -39(%rsi), %xmm1 mov $-39, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(23bytes): movdqu -23(%rdi), %xmm2 movdqu -23(%rsi), %xmm1 mov $-23, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -8(%rdi), %rax mov -8(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif xor %eax, %eax ret # endif @@ -1193,33 +1708,53 @@ L(72bytes): mov $-72, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(56bytes): movdqu -56(%rdi), %xmm2 movdqu -56(%rsi), %xmm1 mov $-56, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(40bytes): movdqu -40(%rdi), %xmm2 movdqu -40(%rsi), %xmm1 mov $-40, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(24bytes): movdqu -24(%rdi), %xmm2 movdqu -24(%rsi), %xmm1 mov $-24, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -8(%rsi), %rcx mov -8(%rdi), %rax cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif xor %eax, %eax ret @@ -1232,32 +1767,52 @@ L(73bytes): mov $-73, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(57bytes): movdqu -57(%rdi), %xmm2 movdqu 
-57(%rsi), %xmm1 mov $-57, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(41bytes): movdqu -41(%rdi), %xmm2 movdqu -41(%rsi), %xmm1 mov $-41, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(25bytes): movdqu -25(%rdi), %xmm2 movdqu -25(%rsi), %xmm1 mov $-25, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -9(%rdi), %rax mov -9(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif movzbl -1(%rdi), %eax movzbl -1(%rsi), %ecx sub %ecx, %eax @@ -1270,35 +1825,60 @@ L(74bytes): mov $-74, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(58bytes): movdqu -58(%rdi), %xmm2 movdqu -58(%rsi), %xmm1 mov $-58, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(42bytes): movdqu -42(%rdi), %xmm2 movdqu -42(%rsi), %xmm1 mov $-42, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(26bytes): movdqu -26(%rdi), %xmm2 movdqu -26(%rsi), %xmm1 mov $-26, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -10(%rdi), %rax mov -10(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif movzwl -2(%rdi), %eax movzwl -2(%rsi), %ecx +# ifdef USE_AS_BCMP + sub %ecx, %eax + ret +# else jmp L(diffin2bytes) +# endif .p2align 4 L(75bytes): @@ -1307,37 +1887,61 @@ L(75bytes): mov $-75, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(59bytes): movdqu -59(%rdi), %xmm2 movdqu 
-59(%rsi), %xmm1 mov $-59, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(43bytes): movdqu -43(%rdi), %xmm2 movdqu -43(%rsi), %xmm1 mov $-43, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(27bytes): movdqu -27(%rdi), %xmm2 movdqu -27(%rsi), %xmm1 mov $-27, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -11(%rdi), %rax mov -11(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif mov -4(%rdi), %eax mov -4(%rsi), %ecx +# ifdef USE_AS_BCMP + sub %ecx, %eax +# else cmp %eax, %ecx jne L(diffin4bytes) xor %eax, %eax +# endif ret # endif .p2align 4 @@ -1347,41 +1951,66 @@ L(76bytes): mov $-76, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(60bytes): movdqu -60(%rdi), %xmm2 movdqu -60(%rsi), %xmm1 mov $-60, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(44bytes): movdqu -44(%rdi), %xmm2 movdqu -44(%rsi), %xmm1 mov $-44, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(28bytes): movdqu -28(%rdi), %xmm2 movdqu -28(%rsi), %xmm1 mov $-28, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -12(%rdi), %rax mov -12(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif mov -4(%rsi), %ecx -# ifndef USE_AS_WMEMCMP +# ifdef USE_AS_BCMP mov -4(%rdi), %eax - cmp %eax, %ecx + sub %ecx, %eax # else +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax + cmp %eax, %ecx +# else cmp -4(%rdi), %ecx -# endif +# endif jne L(diffin4bytes) 
xor %eax, %eax +# endif ret # ifndef USE_AS_WMEMCMP @@ -1393,38 +2022,62 @@ L(77bytes): mov $-77, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(61bytes): movdqu -61(%rdi), %xmm2 movdqu -61(%rsi), %xmm1 mov $-61, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(45bytes): movdqu -45(%rdi), %xmm2 movdqu -45(%rsi), %xmm1 mov $-45, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(29bytes): movdqu -29(%rdi), %xmm2 movdqu -29(%rsi), %xmm1 mov $-29, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -13(%rdi), %rax mov -13(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif mov -8(%rdi), %rax mov -8(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif xor %eax, %eax ret @@ -1435,36 +2088,60 @@ L(78bytes): mov $-78, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(62bytes): movdqu -62(%rdi), %xmm2 movdqu -62(%rsi), %xmm1 mov $-62, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(46bytes): movdqu -46(%rdi), %xmm2 movdqu -46(%rsi), %xmm1 mov $-46, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(30bytes): movdqu -30(%rdi), %xmm2 movdqu -30(%rsi), %xmm1 mov $-30, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -14(%rdi), %rax mov -14(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# 
endif mov -8(%rdi), %rax mov -8(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif xor %eax, %eax ret @@ -1475,36 +2152,60 @@ L(79bytes): mov $-79, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(63bytes): movdqu -63(%rdi), %xmm2 movdqu -63(%rsi), %xmm1 mov $-63, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(47bytes): movdqu -47(%rdi), %xmm2 movdqu -47(%rsi), %xmm1 mov $-47, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(31bytes): movdqu -31(%rdi), %xmm2 movdqu -31(%rsi), %xmm1 mov $-31, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -15(%rdi), %rax mov -15(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif mov -8(%rdi), %rax mov -8(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif xor %eax, %eax ret # endif @@ -1515,37 +2216,58 @@ L(64bytes): mov $-64, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(48bytes): movdqu -48(%rdi), %xmm2 movdqu -48(%rsi), %xmm1 mov $-48, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif L(32bytes): movdqu -32(%rdi), %xmm2 movdqu -32(%rsi), %xmm1 mov $-32, %dl pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 +# ifdef USE_AS_BCMP + jnc L(return_not_equals) +# else jnc L(less16bytes) +# endif mov -16(%rdi), %rax mov -16(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP + jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif mov -8(%rdi), %rax mov -8(%rsi), %rcx cmp %rax, %rcx +# ifdef USE_AS_BCMP 
+ jne L(return_not_equals) +# else jne L(diffin8bytes) +# endif xor %eax, %eax ret /* * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. */ +# ifndef USE_AS_BCMP .p2align 3 L(less16bytes): movsbq %dl, %rdx @@ -1561,16 +2283,16 @@ L(diffin8bytes): shr $32, %rcx shr $32, %rax -# ifdef USE_AS_WMEMCMP +# ifdef USE_AS_WMEMCMP /* for wmemcmp */ cmp %eax, %ecx jne L(diffin4bytes) xor %eax, %eax ret -# endif +# endif L(diffin4bytes): -# ifndef USE_AS_WMEMCMP +# ifndef USE_AS_WMEMCMP cmp %cx, %ax jne L(diffin2bytes) shr $16, %ecx @@ -1589,7 +2311,7 @@ L(end): and $0xff, %ecx sub %ecx, %eax ret -# else +# else /* for wmemcmp */ mov $1, %eax @@ -1601,6 +2323,15 @@ L(end): L(nequal_bigger): ret +L(unreal_case): + xor %eax, %eax + ret +# endif +# else + .p2align 4 +L(return_not_equals): + mov $1, %eax + ret L(unreal_case): xor %eax, %eax ret