
    Ug{,                         d dl mZ d dlmZ d dlmZ d dlZddlm	Z	 dddd	Z
dd
Z G d de          Zd Z G d de          Zd Zd ZdddZddZ G d de          Zd ZdS )    )Counter)suppress)
NamedTupleN   is_scalar_nanFreturn_inversereturn_countsc                j    | j         t          k    rt          | ||          S t          | ||          S )a  Helper function to find unique values with support for python objects.

    Uses pure python method for object dtype, and numpy method for
    all other dtypes.

    Parameters
    ----------
    values : ndarray
        Values to check for unknowns.

    return_inverse : bool, default=False
        If True, also return the indices of the unique values.

    return_counts : bool, default=False
        If True, also return the number of times each unique item appears in
        values.

    Returns
    -------
    unique : ndarray
        The sorted unique values.

    unique_inverse : ndarray
        The indices to reconstruct the original array from the unique array.
        Only provided if `return_inverse` is True.

    unique_counts : ndarray
        The number of times each of the unique values comes up in the original
        array. Only provided if `return_counts` is True.
    r	   )dtypeobject_unique_python
_unique_np)valuesr
   r   s      T/var/www/surfInsights/venv3-11/lib/python3.11/site-packages/sklearn/utils/_encode.py_uniquer   
   sO    > |v>
 
 
 	
 ~]       c                    t          j        | ||          }d\  }}|r|^ }}|r|^ }}|s|r|d         }|j        rzt          |d                   ret          j        |t           j                  }|d|dz            }|r	||||k    <   |r,t          j        ||d                   ||<   |d|dz            }|f}|r||fz  }|r||fz  }t          |          dk    r|d         n|S )zHelper function to find unique values for numpy arrays that correctly
    accounts for nans. See `_unique` documentation for details.r	   )NNr   Nr   )npuniquesizer   searchsortednansumlen)r   r
   r   uniquesinversecountsnan_idxrets           r   r   r   3   sD    i~]  G !OGV #"& $#'  !* | +gbk22 +/'2622-GaK-( 	1)0GGg%& 	+ fVGHH%566F7OMgkM*F*C z yXX]]3q66+r   c                   .    e Zd ZU dZeed<   eed<   d ZdS )MissingValuesz'Data class for missing data informationr   nonec                     g }| j         r|                    d           | j        r|                    t          j                   |S )z3Convert tuple to a list where None is always first.N)r%   appendr   r   )selfoutputs     r   to_listzMissingValues.to_listb   sF    9 	 MM$8 	"MM"&!!!r   N)__name__
__module____qualname____doc__bool__annotations__r*    r   r   r$   r$   \   s=         11	III
JJJ    r   r$   c                     d | D             }|s| t          dd          fS d|v r7t          |          dk    rt          dd          }n#t          dd          }nt          dd          }| |z
  }||fS )a.  Extract missing values from `values`.

    Parameters
    ----------
    values: set
        Set of values to extract missing from.

    Returns
    -------
    output: set
        Set with missing values extracted.

    missing_values: MissingValues
        Object with missing value information.
    c                 4    h | ]}|t          |          |S Nr   ).0values     r   	<setcomp>z#_extract_missing.<locals>.<setcomp>|   s,       U]mE6J6J]]]]r   F)r   r%   Nr   T)r$   r   )r   missing_values_setoutput_missing_valuesr)   s       r   _extract_missingr:   l   s      !    <}U;;;;;!!!!""a''$1e$$G$G$G!! %2d$F$F$F!! -$U C C C ((F(((r   c                   (     e Zd ZdZ fdZd Z xZS )_nandictz!Dictionary with support for nans.c                     t                                          |           |                                D ]\  }}t          |          r
|| _         d S d S r4   )super__init__itemsr   	nan_value)r(   mappingkeyr6   	__class__s       r   r?   z_nandict.__init__   sc    !!!!--// 	 	JCS!! !&	 	r   c                 l    t          | d          rt          |          r| j        S t          |          )NrA   )hasattrr   rA   KeyErrorr(   rC   s     r   __missing__z_nandict.__missing__   7    4%% 	"-*<*< 	">!smmr   )r+   r,   r-   r.   r?   rI   __classcell__rD   s   @r   r<   r<      sM        ++          r   r<   c                     t          d t          |          D                       t          j        fd| D                       S )z,Map values based on its position in uniques.c                     i | ]\  }}||	S r1   r1   )r5   ivals      r   
<dictcomp>z#_map_to_integer.<locals>.<dictcomp>   s    >>>Cc1>>>r   c                      g | ]
}|         S r1   r1   )r5   vtables     r   
<listcomp>z#_map_to_integer.<locals>.<listcomp>   s    ...!U1X...r   )r<   	enumerater   array)r   r   rT   s     @r   _map_to_integerrX      sL    >>9W+=+=>>>??E8....v...///r   c                "   	 t          |           }t          |          \  }}t          |          }|                    |                                           t          j        || j                  }nP# t          $ rC t          d t          d | D                       D                       }t          d|           w xY w|f}|r|t          | |          fz  }|r|t          | |          fz  }t          |          dk    r|d         n|S )Nr   c              3   $   K   | ]}|j         V  d S r4   )r-   )r5   ts     r   	<genexpr>z!_unique_python.<locals>.<genexpr>   s$      LL!q~LLLLLLr   c              3   4   K   | ]}t          |          V  d S r4   )type)r5   rS   s     r   r]   z!_unique_python.<locals>.<genexpr>   s(      2K2Kq4772K2K2K2K2K2Kr   zPEncoders require their input argument must be uniformly strings or numbers. Got r   r   )setr:   sortedextendr*   r   rW   r   	TypeErrorrX   _get_countsr   )r   r
   r   uniques_setmissing_valuesr   typesr"   s           r   r   r      s<   
&kk&6{&C&C#^%%~--//000(7&,777 
 
 
LLs2K2KF2K2K2K/K/KLLLLL/',/ /
 
 	

 *C 30022 /FG,,..XX]]3q66+s   A2A5 5ACT)check_unknownc                2   | j         j        dv rB	 t          | |          S # t          $ r$}t	          dt          |                     d}~ww xY w|r1t          | |          }|rt	          dt          |                     t          j        ||           S )a  Helper function to encode values into [0, n_uniques - 1].

    Uses pure python method for object dtype, and numpy method for
    all other dtypes.
    The numpy method has the limitation that the `uniques` need to
    be sorted. Importantly, this is not checked but assumed to already be
    the case. The calling method needs to ensure this for all non-object
    values.

    Parameters
    ----------
    values : ndarray
        Values to encode.
    uniques : ndarray
        The unique values in `values`. If the dtype is not object, then
        `uniques` needs to be sorted.
    check_unknown : bool, default=True
        If True, check for values in `values` that are not in `unique`
        and raise an error. This is ignored for object dtype, and treated as
        True in this case. This parameter is useful for
        _BaseEncoder._transform() to avoid calling _check_unknown()
        twice.

    Returns
    -------
    encoded : ndarray
        Encoded values
    OUSz%y contains previously unseen labels: N)	r   kindrX   rG   
ValueErrorstr_check_unknownr   r   )r   r   rh   ediffs        r   _encoderq      s    : |E!!	O"67333 	O 	O 	OMSVVMMNNN	O  	V!&'22D V !TT!T!TUUUw///s     
AA		Ac                 6   d}| j         j        dv rt          |           }t          |          \  }}t          |          t                    \  |z
  }|j        oj         }|j        oj         }fd|rO|s|s|r!t          j        fd| D                       }n(t          j        t          |           t                    }t          |          }|r|                    d           |r|                    t          j                   nt          j        |           }	t          j        |	|d          }|rE|j        rt          j        | |          }n(t          j        t          |           t                    }t          j        |                                          rSt          j        |          }
|
                                r+|j        r|rt          j        |           }d||<   ||
          }t          |          }|r||fS |S )	a  
    Helper function to check for unknowns in values to be encoded.

    Uses pure python method for object dtype, and numpy method for
    all other dtypes.

    Parameters
    ----------
    values : array
        Values to check for unknowns.
    known_values : array
        Known values. Must be unique.
    return_mask : bool, default=False
        If True, return a mask of the same shape as `values` indicating
        the valid values.

    Returns
    -------
    diff : list
        The unique values present in `values` and not in `know_values`.
    valid_mask : boolean array
        Additionally returned if ``return_mask=True``.

    Nrj   c                 N    | v p j         o| d u pj        ot          |           S r4   )r%   r   r   )r6   missing_in_uniquesre   s    r   is_validz _check_unknown.<locals>.is_valid  sD    $ )%* "TM) &) )!%((r   c                 &    g | ]} |          S r1   r1   )r5   r6   ru   s     r   rU   z"_check_unknown.<locals>.<listcomp>  s!    &K&K&K5xx&K&K&Kr   rZ   Tassume_uniquer   )r   rk   r`   r:   r   r%   r   rW   onesr   r/   listr'   r   	setdiff1dr   isinisnanany)r   known_valuesreturn_mask
valid_mask
values_setmissing_in_valuesrp   nan_in_diffnone_in_diffunique_valuesdiff_is_nanis_nanru   rt   re   s               @@@r   rn   rn      sJ   2 J|E!![[
(8(D(D%
%,''*:;*G*G''K''+J4F4J0J(-M6H6M2M	 	 	 	 	 	  	> >{ >l >X&K&K&K&KF&K&K&KLL

WS[[===
Dzz 	KK 	 KK	&))|M<tLLL 	>y >WV\::

WS[[===
 8L!!%%'' 		*(4..K   *9 + +Xf--F)*Jv& [L)Dzz  ZKr   c                   .     e Zd ZdZ fdZd Zd Z xZS )_NaNCounterz$Counter with support for nan values.c                 p    t                                          |                     |                     d S r4   )r>   r?   _generate_items)r(   r@   rD   s     r   r?   z_NaNCounter.__init__D  s/    --e4455555r   c              #      K   |D ]=}t          |          s|V  t          | d          sd| _        | xj        dz  c_        >dS )z>Generate items without nans. Stores the nan counts separately.	nan_countr   r   N)r   rF   r   )r(   r@   items      r   r   z_NaNCounter._generate_itemsG  sf       	  	 D && 


4-- #!"NNaNNN	  	 r   c                 l    t          | d          rt          |          r| j        S t          |          )Nr   )rF   r   r   rG   rH   s     r   rI   z_NaNCounter.__missing__Q  rJ   r   )r+   r,   r-   r.   r?   r   rI   rK   rL   s   @r   r   r   A  s\        ..6 6 6 6 6           r   r   c                    | j         j        dv rt          |           }t          j        t          |          t          j                  }t          |          D ]<\  }}t          t                    5  ||         ||<   ddd           n# 1 swxY w Y   =|S t          | d          \  }}t          j        ||d          }t          j        |d                   rt          j        |d                   rd|d<   t          j        |||                   }	t          j        |t          j                  }||	         ||<   |S )zGet the count of each of the `uniques` in `values`.

    The counts will use the order passed in by `uniques`. For non-object dtypes,
    `uniques` is assumed to be sorted and `np.nan` is at the end.
    OUrZ   NT)r   rw   r   )r   rk   r   r   zerosr   int64rV   r   rG   r   r|   r}   r   
zeros_like)
r   r   counterr)   rO   r   r   r    uniques_in_valuesunique_valid_indicess
             r   rd   rd   W  sy    |D  f%%#g,,bh777 )) 	* 	*GAt(## * *#DMq	* * * * * * * * * * * * * * *&vTBBBM6 dKKK	xb!"" %rx'<'< % $"?='BS:TUU]7"(333F &'; <FMs   3BB	B	)FF)F)collectionsr   
contextlibr   typingr   numpyr   _missingr   r   r   r$   r:   dictr<   rX   r   rq   rn   r   rd   r1   r   r   <module>r      s                         # # # # # # ',5 & & & & &R&, &, &, &,R    J    #) #) #)L    t    0 0 0, , ,4 /3 '0 '0 '0 '0 '0TR R R Rj    '   ,    r   