// file kernel/n/x86-64/sqrt_n2.S: O(n^2) square root of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                         Quadratic square root                         |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                             # +-----------------+
                             # |   Square root   |
                             # +-----------------+

        
# void xn(sqrt_n2)(chiffre *a, long la, chiffre *b)
#
# input:
# a = natural number of length la
# b = natural number of length la/2
#
# constraints:
# la > 0, la even, BASE/16 <= a[la-1] < BASE/4
# a,b do not overlap
#
# output:
# b <- 2*floor(sqrt(a))
# a <- a - b^2/4

#ifdef assembly_sn_sqrt_n2
#undef L
#define L(x) .Lsn_fsqrt_n2_##x

# Quadratic square root, digit by digit.
#
# A Newton iteration on the two top digits of a yields the top digit of
# the root.  Each pass of the main loop then produces one more digit:
# an estimated quotient digit v, a multiply-subtract done by an external
# unrolled loop, and a correction loop run when the remainder borrowed.
# b holds twice the root throughout.
#
# Arguments arrive as rdi = a, rsi = la, rdx = b.
#
# Register roles as set up and used below:
#   rdi = pointer into a, stepped down one digit per pass of the main loop
#   rsi = &b[la/2], one digit past the top digit of b
#   r10 = countdown, starts at la and drops by 2 per digit of b
#   r11 = -lb, decremented once per pass of the main loop
#   r12 = entry address inside .Lsn_mul_sub_loop, reached by indirect call
#   rbx = 0, source of BASE-1 (rbx-1) and zero addend of the adc chains
#   rbp = current root digit: u, then v, then 2v
#
# NOTE(review): rbx, rbp and r12 are callee-saved in the System V ABI and
# are overwritten here; presumably the entry/exit macros used below
# (defined elsewhere) save and restore them - confirm.

# With debug_sqrt_n2 defined the routine is emitted under the _buggy name.
#ifdef debug_sqrt_n2
ENTER(sn_sqrt_n2_buggy)
#else
ENTER(sn_sqrt_n2)
#endif

	# set up the registers
        movq   %rsi,    %r10            # r10 <- la, kept as the loop countdown
        leaq  -16(%rdi,%rsi,8), %rdi    # rdi <- &a[la-2]
        leaq     (%rdx,%rsi,4), %rsi    # rsi <- &b[la/2] (la*4 bytes = la/2 digits)
	xorq   %r11,    %r11            # r11 <- 0 (r11 holds -lb)
	xorq   %rbx,    %rbx            # rbx <- 0 (zero addend for the carries)

        # top digit of b <- 2*floor(sqrt(x)) where x = a[la-1]:a[la-2],
        # then a[la-1]:a[la-2] <- x - floor(sqrt(x))^2
        leaq  -1(%rbx), %rbp            # rbp <- BASE-1 (rbx is 0)
	shrq   $1,      %rbp            # rbp <- u = BASE/2 - 1, first guess
        xorq   %rax,    %rax            # first Newton correction is 0
        ALIGN(8)
1:
        sarq   $1,      %rax            # rax <- (q-u)/2, signed halving
        addq   %rax,    %rbp            # u <- u + (q-u)/2, about (u + x/u)/2
        movq   (%rdi),  %rax            # rdx:rax <- x
        movq   8(%rdi), %rdx
        divq   %rbp                     # rax <- q = floor(x/u), rdx <- r = x - q*u
        subq   %rbp,    %rax            # rax <- q - u
        jb     1b                       # q < u: iterate again
        # end of the Newton loop, rbp = u = floor(sqrt(x))
        # rax = q - u with q = floor(x/u), rdx = r = x - q*u
        # q is u, u+1 or u+2 and we want x - u^2 = r, r+u or r+2u
        je     2f                       # q = u (flags still from the subq)
        addq   %rbp,    %rdx            # r += u
        decq   %rax
        je     2f                       # q = u+1
        addq   %rbp,    %rdx            # r += u (case q = u+2)
2:
        movq   %rdx,    (%rdi)          # a[la-2] <- x - u^2
        movq   $0,     8(%rdi)          # a[la-1] <- 0
        shlq   $1,       %rbp           # rbp <- 2u
        movq   %rbp,  -8(%rsi)          # top digit of b <- 2u
	jmp   L(next)

        # compute the following digits by divisions
        ALIGN(8)
L(loop):
	# update the entry address into the external unrolled loop:
	# one 23-byte step earlier on each pass, and back to offset 7*23
	# whenever r11 (before its decrement) is a multiple of 8.  On the
	# first pass r11 is 0, so r12 is loaded here before its first use.
	# NOTE(review): 23 is presumably the byte size of one unrolled
	# step of .Lsn_mul_sub_loop - confirm against that routine.
	subq   $23,     %r12            # step the jump address back
	leaq   .Lsn_mul_sub_loop+7*23(%rip), %rax
	testq  $7,      %r11
	cmovz  %rax,    %r12            # restart the 8-step cycle
	decq   %r11                     # lb++ (r11 = -lb)
	movq   %r11,    %rcx
	andq   $-8,     %rcx            # rcx <- -8*ceil(lb/8)
	
        # approximate quotient, may be one or two units too large
        # q <- min(floor(a[lb]:a[lb-1]/b[lb-1]), BASE-1)
	movq   (%rdi),  %rdx            # rdx <- digit of a at rdi, high half of the dividend
	movq -8(%rsi),  %rbp            # rbp <- top digit of b, the divisor
	leaq -1(%rbx),  %rax            # rax <- BASE-1 (rbx is 0)
        cmpq   %rbp,    %rdx
	leaq  -8(%rdi), %rdi            # rdi steps down one digit; lea keeps the cmp flags
        jnb    1f                       # high half >= divisor: keep q = BASE-1
	movq   (%rdi),  %rax            # rdx:rax <- two-digit dividend
        divq   %rbp                     # rax <- quotient digit
1:
        movq   %rax,    %rbp            # rbp <- v, the quotient digit

        # a <- a - v*b - v^2
        movq   %rax, -8(%rsi,%r11,8)    # new lowest digit of b <- v
	mulq   %rbp                     # rdx:rax <- v^2
	movq   %rax,    %r8             # low word of v^2 seeds r8 and r9;
	movq   %rax,    %r9             # NOTE(review): presumably the carry inputs of the external loop
	call   *%r12                    # a -= b*v + v^2, done by the external loop
        shlq   $1,      %rbp            # rbp <- 2v, CF <- top bit of v
	movq   %rbp, -8(%rsi,%r11,8)    # lowest digit of b <- 2v (mov keeps CF)
	adcq   %rcx,   (%rsi,%r11,8)    # next digit of b += rcx + CF; NOTE(review): rcx presumably 0 after the call
        subq   %r9,    (%rdi)           # subtract the last digits
	adcq   %rbx,    %rdx            # rdx += rbx + borrow of the subq
	setc   %bl                      # original comment: rbx <- 0; NOTE(review): needs no carry above
	subq   %rdx,  8(%rdi)           # subtract rdx from the digit above
        jnb    L(next)                  # no borrow: go to the next digit

        # fix the quotient and the remainder when the remainder is < 0
L(corr):
        subq   $1,   -8(%rsi,%r11,8)    # b <- b-1
        sbbq   $0,     (%rsi,%r11,8)    # borrow goes into the next digit of b
	movq   %r11,    %rcx            # rcx <- -lb, index of the lowest digit (mov keeps CF)
	ALIGN(8)
L(add):
	movq  -8(%rsi,%rcx,8), %rax     # a <- a + b, digit by digit
	adcq   %rax,  (%rdi,%rcx,8)     # first pass uses the CF left by the sbbq above
	incq   %rcx                     # inc keeps CF for the adc chain
	jle    L(add)                   # loop while rcx <= 0
        adcq   %rbx,  8(%rdi)           # last carry, into the digit above
        decq -8(%rsi,%r11,8)            # b <- b-1 (dec keeps CF from the adcq)
        jnc    L(corr)                  # no carry out of a: correct once more

        # next digit
L(next):
	subq   $2,      %r10            # two digits of a per digit of b
        jne    L(loop)
        RETURN_WITH_SP
        
#endif /* assembly_sn_sqrt_n2 */

