
    Ugh                     D    d Z ddlZddlmZ ddlmZ  G d d          ZdS )zA
Loss functions for linear models with raw_prediction = X @ coef
    N)sparse   )squared_normc                       e Zd ZdZd ZddZd Zd Zd Z	 	 	 	 dd
Z		 	 	 	 ddZ
	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 ddZdS )LinearModelLossa  General class for loss functions with raw_prediction = X @ coef + intercept.

    Note that raw_prediction is also known as linear predictor.

    The loss is the average of per sample losses and includes a term for L2
    regularization::

        loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept)
               + 1/2 * l2_reg_strength * ||coef||_2^2

    with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i.

    Gradient and hessian, for simplicity without intercept, are::

        gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef
        hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X
                  + l2_reg_strength * identity

    Conventions:
        if fit_intercept:
            n_dof =  n_features + 1
        else:
            n_dof = n_features

        if base_loss.is_multiclass:
            coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
        else:
            coef.shape = (n_dof,)

        The intercept term is at the end of the coef array:
        if base_loss.is_multiclass:
            if coef.shape (n_classes, n_dof):
                intercept = coef[:, -1]
            if coef.shape (n_classes * n_dof,)
                intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof]
            intercept.shape = (n_classes,)
        else:
            intercept = coef[-1]

    Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as

        coef.reshape((n_classes, -1), order="F")

    The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
    coefficients without intercept, coef[:, :-1], contiguous and speeds up
    matrix-vector computations.

    Note: If the average loss per sample is wanted instead of the sum of the loss per
    sample, one can simply use a rescaled sample_weight such that
    sum(sample_weight) = 1.

    Parameters
    ----------
    base_loss : instance of class BaseLoss from sklearn._loss.
    fit_intercept : bool
    c                 "    || _         || _        d S N)	base_lossfit_intercept)selfr
   r   s      `/var/www/surfInsights/venv3-11/lib/python3.11/site-packages/sklearn/linear_model/_linear_loss.py__init__zLinearModelLoss.__init__E   s    "*    Nc                     |j         d         }| j        j        }| j        r|dz   }n|}| j        j        rt          j        |||f|d          }nt          j        |||          }|S )a  Allocate coef of correct shape with zeros.

        Parameters:
        -----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        dtype : data-type, default=None
            Overrides the data type of coef. With dtype=None, coef will have the same
            dtype as X.

        Returns
        -------
        coef : ndarray of shape (n_dof,) or (n_classes, n_dof)
            Coefficients of a linear model.
           F)shapedtypeorderr   r   )r   r
   	n_classesr   is_multiclassnp
zeros_like)r   Xr   
n_featuresr   n_dofcoefs          r   init_zero_coefzLinearModelLoss.init_zero_coefI   s|      WQZ
N,	 	NEEE>' 	>=9e*<EQTUUUDD=%u===Dr   c                 
   | j         j        s| j        r|d         }|dd         }nZd}|}nU|j        dk    r$|                    | j         j        dfd          }n|}| j        r|dddf         }|ddddf         }nd}||fS )a  Helper function to get coefficients and intercept.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        N        r   r   r   )r
   r   r   ndimreshaper   )r   r   	interceptweightss       r   weight_interceptz LinearModelLoss.weight_intercepte   s    $ ~+ 	 !  H	ss)	 yA~~,,(@"'ES,QQ!  #AAArEN	!!!!SbS&/		!!r   c                     |                      |          \  }}| j        j        s	||z  |z   }n||j        z  |z   }|||fS )ai  Helper function to get coefficients, intercept and raw_prediction.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        raw_prediction : ndarray of shape (n_samples,) or             (n_samples, n_classes)
        )r(   r
   r   T)r   r   r   r'   r&   raw_predictions         r   weight_intercept_rawz$LinearModelLoss.weight_intercept_raw   sX    , "22488~+ 	7[94NN ]Y6N	>11r   c                 P    |j         dk    r||z  nt          |          }d|z  |z  S )z5Compute L2 penalty term l2_reg_strength/2 *||w||_2^2.r   g      ?)r$   r   )r   r'   l2_reg_strengthnorm2_ws       r   
l2_penaltyzLinearModelLoss.l2_penalty   s5    '.|q'8'8'G##l7>S>S_$w..r   r"   r   c                    ||                      ||          \  }}	}n|                     |          \  }}	| j                            ||d|          }
t	          j        |
|          }
|
|                     ||          z   S )a  Compute the loss as weighted average over point-wise losses.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        loss : float
            Weighted average of losses per sample, plus penalty.
        Ny_truer+   sample_weight	n_threads)r'   )r,   r(   r
   lossr   averager0   )r   r   r   yr4   r.   r5   r+   r'   r&   r6   s              r   r6   zLinearModelLoss.loss   s    N !151J1J4QR1S1S.GY!%!6!6t!<!<GY~"")	 # 
 
 z$666doog????r   c                 Z   |j         | j        j        c\  }}	}
|	t          | j                  z   }||                     ||          \  }}}n|                     |          \  }}| j                            ||||          \  }}||nt          j	        |          }|	                                |z  }|| 
                    ||          z  }||z  }| j        j        sOt          j        ||j                  }|j        |z  ||z  z   |d|	<   | j        r|	                                |d<   n|t          j        |
|f|j        d          }|j        |z  ||z  z   |ddd|	f<   | j        r|	                    d          |dddf<   |j        d	k    r|                    d
          }||fS )a\  Computes the sum of loss and gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        loss : float
            Weighted average of losses per sample, plus penalty.

        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        Nr2   r   r!   r   r   r   r   axisr   r#   )r   r
   r   intr   r,   r(   loss_gradientr   sumr0   r   
empty_liker   r*   emptyr$   ravel)r   r   r   r8   r4   r.   r5   r+   	n_samplesr   r   r   r'   r&   r6   grad_pointwisesw_sumgrads                     r   r?   zLinearModelLoss.loss_gradient   s   T ./Wdn6N*JS!3444!151J1J4QR1S1S.GY!%!6!6t!<!<GY#~;;)'	  <  
  
n ,39N9NxxzzF"999& ~+ 	-=W];;;D !n 47P PD*! 0)--//R8Y.gm3OOOD#1#3a#7/G:S#SDKZK ! 9,00a088QQQUyA~~zzz,,Tzr   c                    |j         | j        j        c\  }}	}
|	t          | j                  z   }||                     ||          \  }}}n|                     |          \  }}| j                            ||||          }||nt          j	        |          }||z  }| j        j
        sPt          j        ||j                  }|j        |z  ||z  z   |d|	<   | j        r|	                                |d<   |S t          j        |
|f|j        d          }|j        |z  ||z  z   |ddd|	f<   | j        r|	                    d          |dddf<   |j        d	k    r|                    d
          S |S )a  Computes the gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        Nr2   r:   r!   r   r;   r   r<   r   r#   )r   r
   r   r>   r   r,   r(   gradientr   r@   r   rA   r   r*   rB   r$   rC   )r   r   r   r8   r4   r.   r5   r+   rD   r   r   r   r'   r&   rE   rF   rG   s                    r   rI   zLinearModelLoss.gradient5  s   N ./Wdn6N*JS!3444!151J1J4QR1S1S.GY!%!6!6t!<!<GY00)'	 1 
 
 ,39N9N& ~+ 	=W];;;D !n 47P PD*! 0)--//RK8Y.gm3OOOD#1#3a#7/G:S#SDKZK ! 9,00a088QQQUyA~~zzz,,,r   c
                    |j         \  }
}|t          | j                  z   }|	|                     ||          \  }}}	n|                     |          \  }}| j                            ||	||          \  }}||
nt          j        |          }||z  }||z  }t          j	        |dk              dk    }t          j
        |          }| j        j        sp|t          j        ||j                  }n|}|j        |z  ||z  z   |d|<   | j        r|                                |d<   |t          j        ||f|j                  }n|}|r|||fS t!          j        |          rA|j        t!          j        |df|
|
f          z  |z                                  |d|d|f<   n2|dddf         |z  }t          j        |j        |          |d|d|f<   |dk    r,|                    d          d||z  |d	z   xx         |z  cc<   | j        r3|j        |z  }||dddf<   ||dddf<   |                                |d
<   nt,          |||fS )a  Computes gradient and hessian w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        gradient_out : None or ndarray of shape coef.shape
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or ndarray
            A location into which the hessian is stored. If None, a new array
            might be created.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.

        hessian : ndarray
            Hessian matrix.

        hessian_warning : bool
            True if pointwise hessian has more than half of its elements non-positive.
        Nr2   r   g      ?r:   r!   r   r   r   )r!   r!   )r   r>   r   r,   r(   r
   gradient_hessianr   r@   meanabsr   rA   r   r*   rB   r   issparse
dia_matrixtoarraydotr%   NotImplementedError)r   r   r   r8   r4   r.   r5   gradient_outhessian_outr+   rD   r   r   r'   r&   rE   hess_pointwiserF   hessian_warningrG   hessWXXhs                          r   rL   z LinearModelLoss.gradient_hessian~  s   j !"	:S!3444!151J1J4QR1S1S.GY!%!6!6t!<!<GY)-)H)H)'	 *I *
 *
& ,39N9N& & 
 '.A"566=//~+ :	&#}T???# !n 47P PD*! 0)--//R "xuenGMJJJ" 3T?22
 q!! AC''+Iy3I   	
 ')) [j[+:+-.. $AAAtG,q013R[j[+:+-."" R  8zE)eai8  $%    ! 
4 S>) "SbS"W "R"W-1133V &%T?**r   c                     j          j        j        c\  }t           j                  z                                  \  }}	|nt          j                   j        j        s9 j        	                    ||	|          \  }
}|
z  }
|z  }t          j
        j                  }j        |
z  z  z   |d<    j        r|
                                |d<   |                                t          j                  rt          j        |df||f          z  n|ddt          j        f         z   j        rNt          j        t          j                            d                              t          j                   fd}n j                            ||	|          \  }
|
z  }
t          j        fj        d	
          }|
j        z  z  z   |dddf<    j        r|
                    d          |dddf<    fd}j        dk    r|                    d	          |fS ||fS )a  Computes gradient and hessp (hessian product function) w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.

        hessp : callable
            Function that takes in a vector input of shape of gradient and
            and returns matrix-vector product with hessian.
        Nr2   r:   r!   r   rK   r<   c                    t          j        |           }t          j                  rj        | d          z  z  |d <   n4t           j                            j        | d          g          |d <   |d xx         | d          z  z  cc<   j        r7|d xx         | d         z  z  cc<   | d          z  | d         z  z   |d<   |S )Nr!   )r   rA   r   rO   r*   linalg	multi_dotr   )	sretr   hXhX_sumhessian_sumr.   r   r   s	     r   hesspz7LinearModelLoss.gradient_hessian_product.<locals>.hesspR  s   mA&&?1%% V'(sb1[j[>.A'BC$$')y':':ACQ{
{^;T'U'UC$KZK   Oan$DD   % L$$$"6$$$$q*~5ae8KKCG
r   r   r;   c                 B   |                      dfd          } j        r| d d df         }| d d d df         } nd}| j        z  |z   }|
 |z                      d          d d t          j        f         z  }|
z  }|d d t          j        f         z  }t	          j        fj        d          }|j        z  z  | z  z   |d d d 	f<   j        r |                    d          z  |d d df<   j        dk    r|	                    d          S |S )Nr!   r   r#   r   r   r<   r;   )
r%   r   r*   r@   r   newaxisrB   r   r$   rC   )r_   s_intercepttmp	hess_prodr   r   r.   r   r   r   probar4   r   rF   r'   s       r   rd   z7LinearModelLoss.gradient_hessian_product.<locals>.hessp  s\   IIy"oSI99% $"#AAArE(K!!!SbS&	AA"#K!#g+))q)11!!!RZ-@@u ,=BJ77C Hi%7w}TWXXX	-0UQY&,@?UVCV,V	!!![j[.)% @'*wwAw'?Iaaae$9>>$???555$$r   r   r#   )r   r
   r   r>   r   r,   r   r@   r   rL   rA   r   r*   r   rO   rP   rf   squeezeasarray
atleast_1dgradient_probarB   r$   rC   )r   r   r   r8   r4   r.   r5   rD   r&   r+   rE   rV   rG   rd   ra   rb   rc   r   r   r   rj   rF   r'   s   ``` ``        @@@@@@@@@r   gradient_hessian_productz(LinearModelLoss.gradient_hessian_product	  s-   @ ./Wdn6N*JS!3444-1-F-FtQ-O-O*N+39N9N~+ p	4-1^-L-L-+#	 .M . .*NN f$Nf$N=W];;;D !n 47P PD*! 0)--//R ),,..Kq!! 7%~q&9)YAWXXX 
 $AAArzM2Q6! / BJrvv1v~~$>$>??v..           & %)N$A$A-+#	 %B % %!NE f$N8Y.gm3OOOD#1#3a#7/G:S#SDKZK ! 9,00a088QQQU.% % % % % % % % % % % % % % %. yA~~zzz,,e33U{r   r	   )Nr"   r   N)Nr"   r   NNN)Nr"   r   )__name__
__module____qualname____doc__r   r   r(   r,   r0   r6   r?   rI   rL   ro    r   r   r   r      s4       7 7r+ + +   8%" %" %"N2 2 2@/ / / 4@ 4@ 4@ 4@v L L L Lf G G G G\ I+ I+ I+ I+X NOW W W W W Wr   r   )rs   numpyr   scipyr   utils.extmathr   r   rt   r   r   <module>rx      s}               ( ( ( ( ( (U
 U
 U
 U
 U
 U
 U
 U
 U
 U
r   