// file kernel/n/alpha/gcd_n2.S: greatest common divisor
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                                   PGCD                                |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                       # +------------------------+
                       # |     Two-digit GCD      |
                       # +------------------------+
 
   # void xn(gcd_2)(chiffre *x)
   # input:
   #   x = array of 8 digits [a0,a1,b0,b1,p,s,q,r]
   #   a = a0 + BASE*a1, b = b0 + BASE*b1
   # 
   # constraint: 0 < b < a
   # 
   # Expands the fraction a/b as a continued fraction for as long as the
   # coefficients fit in a single digit
   # 
   # output:
   #   [a0,a1,b0,b1] <- unspecified
   #   [p,s,q,r]     <- coefficients of the linear combinations performed

#ifdef assembly_sn_gcd_2
#define L(x) .Lsn_gcd_2_##x

	# sn_gcd_2 -- Alpha implementation of the two-word GCD step.
	# Entry: $16 = x, an array of eight 64-bit words.
	# Reads a = x[0] + BASE*x[1] and b = x[2] + BASE*x[3], starts from
	# [p,q,r,s] = identity and alternates shift-and-subtract divisions
	# of a by b and of b by a. Each quotient bit adds the shifted p,r
	# into q,s (first kind of division) or the shifted s,q into r,p
	# (second kind). The routine stops when a remainder becomes zero or
	# when a coefficient would no longer fit in 64 bits, then stores
	# p,s,q,r into x[4],x[5],x[6],x[7]. The reduced a and b live in
	# registers only and are never written back to x.
	# Registers written: $0-$8, $17-$20 and $gp. No stack is used.
	#
	# Register assignment:
	#   a0:a1    low:high word of a
	#   b0:b1    low:high word of b
	#   p,q,r,s  coefficients
	#   i        number of left shifts still to be undone
	#   t,u,v,w  scratch (borrows, carries, comparison results)
	#define _a0_ $0
	#define _a1_ $1
	#define _b0_ $2
	#define _b1_ $3
	#define _p_  $4
	#define _q_  $5
	#define _r_  $6
	#define _s_  $7
	#define _i_  $8
	#define _t_  $17
	#define _u_  $18
	#define _v_  $19
	#define _w_  $20
	
        .align 5
        .globl sn_gcd_2
        .ent   sn_gcd_2
sn_gcd_2:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)

	# [p,q,r,s] <- identity (p = s = 1, q = r = 0)
	addq   $31, 1,   _p_
	bis    $31, $31, _q_
	bis    $31, $31, _r_
	addq   $31, 1,   _s_

	# load a and b from the array
	ldq    _a0_,  0($16)
	ldq    _a1_,  8($16)
	ldq    _b0_, 16($16)
	ldq    _b1_, 24($16)

	# reset the shift counter
	bis    $31,  $31,  _i_
	
	# Here a >= b. Shift b,p,r left while a >= 2*b.
	# Start with a <- a - b (two-word subtract, u = b1 + borrow).
	cmpult _a0_, _b0_, _u_
	addq   _b1_, _u_,  _u_
	subq   _a0_, _b0_, _a0_
	subq   _a1_, _u_,  _a1_
	br     $31,  L(loop_b)

	.align 5
L(shift_b):
	# a0:a1 is still >= b0:b1: subtract once more (u was set by loop_b),
	# then double b0:b1, moving the top bit of b0 into b1
	subq   _a0_, _b0_, _a0_
	subq   _a1_, _u_,  _a1_
	cmplt  _b0_, $31,  _u_
	addq   _b0_, _b0_, _b0_
	addq   _b1_, _b1_, _b1_
	addq   _b1_, _u_,  _b1_
	# leave when p or r has its top bit set, since doubling would lose it
	or     _p_,  _r_,  _u_
	blt    _u_,  L(shift_pr)
	addq   _p_,  _p_,  _p_
	addq   _r_,  _r_,  _r_
	addq   _i_,  1,    _i_
L(loop_b):
	# v = (a0:a1 < b0:b1) unsigned; keep shifting while it is false
	cmpult _a0_, _b0_, _u_
	addq   _b1_, _u_,  _u_
	cmpult _a1_, _u_,  _v_
	beq    _v_,  L(shift_b)

	# Here a0:a1 holds a - 2^i*b,
	#      b0:b1 holds 2^i*b
	#      p     holds 2^i*p
	#      r     holds 2^i*r
	# Divide a by b with subtractions and shifts; leave as soon as q or s
	# overflows
	br      $31,  L(add_qs_pr)
	.align  5
L(div_a):
	# halve b0:b1 (two-word right shift), p and r; one shift less to undo
	sll     _b1_, 63,   _u_
	srl     _b1_, 1,    _b1_
	srl     _b0_, 1,    _b0_
	addq    _b0_, _u_,  _b0_
	srl     _p_,  1,    _p_
	srl     _r_,  1,    _r_
	subq    _i_,  1,    _i_
	# a0:a1 < b0:b1 means this quotient bit is zero; otherwise subtract
	cmpult  _a0_, _b0_, _u_
	addq    _b1_, _u_,  _u_
	cmpult  _a1_, _u_,  _v_
	bne     _v_,  L(next_a)
	subq    _a0_, _b0_, _a0_
	subq    _a1_, _u_,  _a1_
L(add_qs_pr):
	# q <- q + p, s <- s + r, unless one of the sums carries out of 64 bits
	addq    _q_,  _p_,  _u_
	addq    _s_,  _r_,  _v_
	cmpult  _u_,  _p_,  _w_
	cmpult  _v_,  _r_,  _t_
	or      _w_,  _t_,  _w_
	bne     _w_,  L(shift_pr)
	bis     _u_,  _u_,  _q_
	bis     _v_,  _v_,  _s_
L(next_a):
	bne     _i_,  L(div_a)

	# End of the division of a by b.
	# Here a0:a1 holds the remainder, b,p,r have been restored and q,s
	# updated. When a = 0 we are finished.
	or     _a0_, _a1_, _u_
	beq    _u_,  L(shift_pr)
	# otherwise b <- b - a and go on with the division of b by a
	cmpult _b0_, _a0_, _u_
	addq   _a1_, _u_,  _u_
	subq   _b0_, _a0_, _b0_
	subq   _b1_, _u_,  _b1_
	br     $31,  L(loop_a)

	# undo the i remaining left shifts of p and r, then save p,q,r,s
	# (p at offset 32, s at 40, q at 48, r at 56)
L(shift_pr):
	srl   _p_,  _i_,  _p_
	srl   _r_,  _i_,  _r_
	stq   _p_,  32($16)
	stq   _q_,  48($16)
	stq   _r_,  56($16)
	stq   _s_,  40($16)
	ret    $31,  ($26),1
	
	# Here b >= a. Shift a,s,q left while b >= 2*a
	.align 5
L(shift_a):
	# b0:b1 is still >= a0:a1: subtract once more (u was set by loop_a),
	# then double a0:a1, moving the top bit of a0 into a1
	subq   _b0_, _a0_, _b0_
	subq   _b1_, _u_,  _b1_
	cmplt  _a0_, $31,  _u_
	addq   _a0_, _a0_, _a0_
	addq   _a1_, _a1_, _a1_
	addq   _a1_, _u_,  _a1_
	# leave when s or q has its top bit set, since doubling would lose it
	or     _s_,  _q_,  _u_
	blt    _u_,  L(shift_qs)
	addq   _s_,  _s_,  _s_
	addq   _q_,  _q_,  _q_
	addq   _i_,  1,    _i_
L(loop_a):
	# v = (b0:b1 < a0:a1) unsigned; keep shifting while it is false
	cmpult _b0_, _a0_, _u_
	addq   _a1_, _u_,  _u_
	cmpult _b1_, _u_,  _v_
	beq    _v_,  L(shift_a)

	# Here b0:b1 holds b - 2^i*a,
	#      a0:a1 holds 2^i*a
	#      s     holds 2^i*s
	#      q     holds 2^i*q
	# Divide b by a with subtractions and shifts; leave as soon as r or p
	# overflows
	br      $31,  L(add_pr_qs)
	.align  5
L(div_b):
	# halve a0:a1 (two-word right shift), s and q; one shift less to undo
	sll     _a1_, 63,   _u_
	srl     _a1_, 1,    _a1_
	srl     _a0_, 1,    _a0_
	addq    _a0_, _u_,  _a0_
	srl     _s_,  1,    _s_
	srl     _q_,  1,    _q_
	subq    _i_,  1,    _i_
	# b0:b1 < a0:a1 means this quotient bit is zero; otherwise subtract
	cmpult  _b0_, _a0_, _u_
	addq    _a1_, _u_,  _u_
	cmpult  _b1_, _u_,  _v_
	bne     _v_,  L(next_b)
	subq    _b0_, _a0_, _b0_
	subq    _b1_, _u_,  _b1_
L(add_pr_qs):
	# r <- r + s, p <- p + q, unless one of the sums carries out of 64 bits
	addq    _r_,  _s_,  _u_
	addq    _p_,  _q_,  _v_
	cmpult  _u_,  _s_,  _w_
	cmpult  _v_,  _q_,  _t_
	or      _w_,  _t_,  _w_
	bne     _w_, L(shift_qs)
	bis     _u_,  _u_,  _r_
	bis     _v_,  _v_,  _p_
L(next_b):
	bne     _i_,  L(div_b)

	# End of the division of b by a.
	# Here b0:b1 holds the remainder, a,s,q have been restored and r,p
	# updated. When b = 0 we are finished.
	or     _b0_, _b1_, _u_
	beq    _u_,  L(shift_qs)
	# otherwise a <- a - b and go on with the division of a by b
	cmpult _a0_, _b0_, _u_
	addq   _b1_, _u_,  _u_
	subq   _a0_, _b0_, _a0_
	subq   _a1_, _u_,  _a1_
	br     $31,  L(loop_b)

	# undo the i remaining left shifts of q and s, then save p,q,r,s
	# (p at offset 32, s at 40, q at 48, r at 56)
L(shift_qs):
	srl   _q_,  _i_,  _q_
	srl   _s_,  _i_,  _s_
	stq   _p_,  32($16)
	stq   _q_,  48($16)
	stq   _r_,  56($16)
	stq   _s_,  40($16)
	ret    $31,  ($26),1
	
        .end   sn_gcd_2
	
	#undef _a0_
	#undef _a1_
	#undef _b0_
	#undef _b1_
	#undef _p_ 
	#undef _q_ 
	#undef _r_ 
	#undef _s_ 
	#undef _i_ 
	#undef _t_ 
	#undef _u_ 
	#undef _v_ 
	#undef _w_ 
	#undef _w_ 
#undef L
#endif /* assembly_sn_gcd_2 */

                     # +-----------------------------+
                     # |     Two-digit half-GCD      |
                     # +-----------------------------+
 
   # void xn(hgcd_2)(chiffre *x)
   # input:
   #   x = array of 8 digits [a0,a1,b0,b1,p,s,q,r]
   #   a = a0 + BASE*a1, b = b0 + BASE*b1
   # 
   # constraint: 0 < b < a
   # 
   # Expands the fractions a/(b+1) and (a+1)/b as continued fractions for
   # as long as their quotients agree and the coefficients fit in a
   # single digit
   # 
   # output:
   #   [a0,a1,b0,b1] <- unspecified
   #   [p,s,q,r]     <- coefficients of the linear combinations performed

#ifdef assembly_sn_hgcd_2
#define L(x) .Lsn_hgcd_2_##x

	# sn_hgcd_2 -- Alpha implementation of the two-word half-GCD step.
	# Entry: $16 = x, an array of eight 64-bit words.
	# Reads a = x[0] + BASE*x[1] and b = x[2] + BASE*x[3] and starts from
	# [p,q,r,s] = identity. It alternates shift-and-subtract divisions of
	# a-q by b+p and of b-r by a+s, updating q,s resp. r,p with every
	# quotient bit. The routine stops when the divisor side exceeds the
	# dividend side after a division, or when a coefficient would no
	# longer fit in 64 bits, then stores p,s,q,r into x[4],x[5],x[6],x[7].
	# The reduced a and b live in registers only and are never written
	# back to x.
	# Registers written: $0-$8, $17-$20 and $gp. No stack is used.
	#
	# Register assignment:
	#   a0:a1    low:high word of the biased a
	#   b0:b1    low:high word of the biased b
	#   p,q,r,s  coefficients
	#   i        number of left shifts still to be undone
	#   t,u,v,w  scratch (borrows, carries, comparison results)
	#define _a0_ $0
	#define _a1_ $1
	#define _b0_ $2
	#define _b1_ $3
	#define _p_  $4
	#define _q_  $5
	#define _r_  $6
	#define _s_  $7
	#define _i_  $8
	#define _t_  $17
	#define _u_  $18
	#define _v_  $19
	#define _w_  $20
	
        .align 5
        .globl sn_hgcd_2
        .ent   sn_hgcd_2
sn_hgcd_2:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)

	# [p,q,r,s] <- identity (p = s = 1, q = r = 0)
	addq   $31, 1,   _p_
	bis    $31, $31, _q_
	bis    $31, $31, _r_
	addq   $31, 1,   _s_

	# a <- a-q, b <- b+p. With q = 0 and p = 1 only b changes: it is
	# incremented, with the carry out of b0 propagated into b1.
	ldq    _a0_,  0($16)
	ldq    _a1_,  8($16)
	ldq    _b0_, 16($16)
	ldq    _b1_, 24($16)
	addq   _b0_, 1,   _b0_
	cmpeq  _b0_, $31, _u_
	addq   _b1_, _u_, _b1_

	# reset the shift counter
	bis    $31,  $31,  _i_
	
	# Here a-q >= b+p. Shift b,p,r left while a-q >= 2*(b+p).
	# Start with a0:a1 <- a0:a1 - b0:b1 (two-word subtract, u = b1 + borrow).
	cmpult _a0_, _b0_, _u_
	addq   _b1_, _u_,  _u_
	subq   _a0_, _b0_, _a0_
	subq   _a1_, _u_,  _a1_
	br     $31,  L(loop_b)

	.align 5
L(shift_b):
	# a0:a1 is still >= b0:b1: subtract once more (u was set by loop_b),
	# then double b0:b1, moving the top bit of b0 into b1
	subq   _a0_, _b0_, _a0_
	subq   _a1_, _u_,  _a1_
	cmplt  _b0_, $31,  _u_
	addq   _b0_, _b0_, _b0_
	addq   _b1_, _b1_, _b1_
	addq   _b1_, _u_,  _b1_
	# leave when p or r has its top bit set, since doubling would lose it
	or     _p_,  _r_,  _u_
	blt    _u_,  L(shift_pr)
	addq   _p_,  _p_,  _p_
	addq   _r_,  _r_,  _r_
	addq   _i_,  1,    _i_
L(loop_b):
	# v = (a0:a1 < b0:b1) unsigned; keep shifting while it is false
	cmpult _a0_, _b0_, _u_
	addq   _b1_, _u_,  _u_
	cmpult _a1_, _u_,  _v_
	beq    _v_,  L(shift_b)

	# Here a0:a1 holds a-q - 2^i*(b+p),
	#      b0:b1 holds 2^i*(b+p)
	#      p     holds 2^i*p
	#      r     holds 2^i*r
	# Divide a-q by b+p with subtractions and shifts; leave as soon as
	# q or s overflows
	br      $31,  L(add_qs_pr)
	.align 5
L(div_a):
	# halve b0:b1 (two-word right shift), p and r; one shift less to undo
	sll     _b1_, 63,   _u_
	srl     _b1_, 1,    _b1_
	srl     _b0_, 1,    _b0_
	addq    _b0_, _u_,  _b0_
	srl     _p_,  1,    _p_
	srl     _r_,  1,    _r_
	subq    _i_,  1,    _i_
	# a0:a1 < b0:b1 means this quotient bit is zero; otherwise subtract
	cmpult  _a0_, _b0_, _u_
	addq    _b1_, _u_,  _u_
	cmpult  _a1_, _u_,  _v_
	bne     _v_,  L(next_a)
	subq    _a0_, _b0_, _a0_
	subq    _a1_, _u_,  _a1_
L(add_qs_pr):
	# q <- q + p, s <- s + r, unless one of the sums carries out of 64 bits
	addq    _q_,  _p_,  _u_
	addq    _s_,  _r_,  _v_
	cmpult  _u_,  _p_,  _w_
	cmpult  _v_,  _r_,  _t_
	or      _w_,  _t_,  _w_
	bne     _w_,  L(shift_pr)
	bis     _u_,  _u_,  _q_
	bis     _v_,  _v_,  _s_
L(next_a):
	bne     _i_,  L(div_a)
	# all shifts undone: remove the bias by adding the updated q back to
	# a0:a1 and subtracting p from b0:b1 (two-word add and subtract)
	addq    _a0_, _q_,  _a0_
	cmpult  _a0_, _q_,  _u_
	addq    _a1_, _u_,  _a1_
	cmpult  _b0_, _p_,  _u_
	subq    _b0_, _p_,  _b0_
	subq    _b1_, _u_,  _b1_

	# End of the division of a-q by b+p.
	# Here a0:a1 holds the remainder, b,p,r have been restored and q,s
	# updated. When a+s > b-r we are finished.
	# Form a+s in a0:a1 and b-r in b0:b1, compare them (v = b-r < a+s),
	# and subtract a+s from b-r ahead of the next division.
	addq   _a0_, _s_,  _a0_
	cmpult _a0_, _s_,  _u_
	addq   _a1_, _u_,  _a1_
	cmpult _b0_, _r_,  _u_
	subq   _b0_, _r_,  _b0_
	subq   _b1_, _u_,  _b1_
	cmpult _b0_, _a0_, _u_
	addq   _a1_, _u_,  _u_
	cmpult _b1_, _u_,  _v_
	subq   _b0_, _a0_, _b0_
	subq   _b1_, _u_,  _b1_
	beq    _v_,  L(loop_a)

	# undo the i remaining left shifts of p and r, then save p,q,r,s
	# (p at offset 32, s at 40, q at 48, r at 56)
L(shift_pr):
	srl   _p_,  _i_,  _p_
	srl   _r_,  _i_,  _r_
	stq   _p_,  32($16)
	stq   _q_,  48($16)
	stq   _r_,  56($16)
	stq   _s_,  40($16)
	ret    $31,  ($26),1
	
	# Here b-r >= a+s. Shift a,s,q left while b-r >= 2*(a+s)
	.align 5
L(shift_a):
	# b0:b1 is still >= a0:a1: subtract once more (u was set by loop_a),
	# then double a0:a1, moving the top bit of a0 into a1
	subq   _b0_, _a0_, _b0_
	subq   _b1_, _u_,  _b1_
	cmplt  _a0_, $31,  _u_
	addq   _a0_, _a0_, _a0_
	addq   _a1_, _a1_, _a1_
	addq   _a1_, _u_,  _a1_
	# leave when s or q has its top bit set, since doubling would lose it
	or     _s_,  _q_,  _u_
	blt    _u_,  L(shift_qs)
	addq   _s_,  _s_,  _s_
	addq   _q_,  _q_,  _q_
	addq   _i_,  1,    _i_
L(loop_a):
	# v = (b0:b1 < a0:a1) unsigned; keep shifting while it is false
	cmpult _b0_, _a0_, _u_
	addq   _a1_, _u_,  _u_
	cmpult _b1_, _u_,  _v_
	beq    _v_,  L(shift_a)

	# Here b0:b1 holds b-r - 2^i*(a+s),
	#      a0:a1 holds 2^i*(a+s)
	#      s     holds 2^i*s
	#      q     holds 2^i*q
	# Divide b-r by a+s with subtractions and shifts; leave as soon as
	# r or p overflows
	br      $31,  L(add_pr_qs)
	.align  5
L(div_b):
	# halve a0:a1 (two-word right shift), s and q; one shift less to undo
	sll     _a1_, 63,   _u_
	srl     _a1_, 1,    _a1_
	srl     _a0_, 1,    _a0_
	addq    _a0_, _u_,  _a0_
	srl     _s_,  1,    _s_
	srl     _q_,  1,    _q_
	subq    _i_,  1,    _i_
	# b0:b1 < a0:a1 means this quotient bit is zero; otherwise subtract
	cmpult  _b0_, _a0_, _u_
	addq    _a1_, _u_,  _u_
	cmpult  _b1_, _u_,  _v_
	bne     _v_,  L(next_b)
	subq    _b0_, _a0_, _b0_
	subq    _b1_, _u_,  _b1_
L(add_pr_qs):
	# r <- r + s, p <- p + q, unless one of the sums carries out of 64 bits
	addq    _r_,  _s_,  _u_
	addq    _p_,  _q_,  _v_
	cmpult  _u_,  _s_,  _w_
	cmpult  _v_,  _q_,  _t_
	or      _w_,  _t_,  _w_
	bne     _w_,  L(shift_qs)
	bis     _u_,  _u_,  _r_
	bis     _v_,  _v_,  _p_
L(next_b):
	bne     _i_,  L(div_b)
	# all shifts undone: remove the bias by adding the updated r back to
	# b0:b1 and subtracting s from a0:a1 (two-word add and subtract)
	addq    _b0_, _r_,  _b0_
	cmpult  _b0_, _r_,  _u_
	addq    _b1_, _u_,  _b1_
	cmpult  _a0_, _s_,  _u_
	subq    _a0_, _s_,  _a0_
	subq    _a1_, _u_,  _a1_

	# End of the division of b-r by a+s.
	# Here b0:b1 holds the remainder, a,s,q have been restored and r,p
	# updated. When b+p > a-q we are finished.
	# Form b+p in b0:b1 and a-q in a0:a1, compare them (v = a-q < b+p),
	# and subtract b+p from a-q ahead of the next division.
	addq   _b0_, _p_,  _b0_
	cmpult _b0_, _p_,  _u_
	addq   _b1_, _u_,  _b1_
	cmpult _a0_, _q_,  _u_
	subq   _a0_, _q_,  _a0_
	subq   _a1_, _u_,  _a1_
	cmpult _a0_, _b0_, _u_
	addq   _b1_, _u_,  _u_
	cmpult _a1_, _u_,  _v_
	subq   _a0_, _b0_, _a0_
	subq   _a1_, _u_,  _a1_
	beq    _v_,  L(loop_b)

	# undo the i remaining left shifts of q and s, then save p,q,r,s
	# (p at offset 32, s at 40, q at 48, r at 56)
L(shift_qs):
	srl   _q_,  _i_,  _q_
	srl   _s_,  _i_,  _s_
	stq   _p_,  32($16)
	stq   _q_,  48($16)
	stq   _r_,  56($16)
	stq   _s_,  40($16)
	ret    $31,  ($26),1
	
        .end   sn_hgcd_2
	
	#undef _a0_
	#undef _a1_
	#undef _b0_
	#undef _b1_
	#undef _p_ 
	#undef _q_ 
	#undef _r_ 
	#undef _s_ 
	#undef _i_ 
	#undef _t_ 
	#undef _u_ 
	#undef _v_ 
	#undef _w_ 
	#undef _w_ 
#undef L
#endif /* assembly_sn_hgcd_2 */
