
    Uge3                    r$   d dl Z d dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZmZmZ d dlmZ d Zej                            d	d
dg          d             Zej                            d	d
dg          d             Zej                            dej        ej        ej        g          ej                            dej        ej        ej        g          d                         Zej                            dej        ej        ej        g          d             Zd Zd Zd Zd Z d Z!ej                            dg dg dg ej"        g dg dg           ej"        g dg dge#           ej"        g dd ej$        d!gge#           ej"        g dd  e%d"          d!gge#           ej"        g d#g d$ge#           ej"        g d%d ej$        dgge#           ej"        g d%d  e%d"          dgge#          gg d&'          d(             Z&ej                            d	d
dg          ej                            d)d*d+g          ej                            d,dd-g          d.                                     Z'ej                            d)d*d+g          ej                            d/d0d1gd2d1gd0d1ggg d3g d4g d3gfd5d gd6d gd7d8gd6d ggg d9g d:g d;gfg          d<                         Z(d= Z)ej                            d,g d>          ej                            d?g d>          d@                         Z*ej                            dAdBdCg          ej                            dd2d0g ej"        dDdEg          g          dF                         Z+ej                            dAdBdCg          dG             Z,ej                            dHdId1gdJd1ggdIdJgd1ggej-        f ej"        d2d0gdKd0gg          d2dKgd0ggej.        f ej"        dLd!gdMd!gge#          dLdMgd!ggej-        f ej"        dLd!gdMd!gg          dLdMgd!ggej/        f ej"        d2d0gej$        d0gg          d2ej$        gd0ggej        f ej"        dLej$        gdej$        gge#          dLdgej$        ggej-        f ej"        dL e%d"          gd e%d"          gge#          dLdg e%d"          ggej-        fgg dN'          dO             Z0ej                            d	d
dg          ej                            dP ej"        d d8gge#          j1         ej"        d dQgge#          j1        g dRgej-        f ej"        d2d0ggdS          j1         ej"        d2dTggdS          j1        g dUgej2        f ej"        d d8gge#          j1         ej"        d dQgge#          j1         ej"        g dR          gej-        f ej"        dd gge#          j1         ej"        dd8gge#          j1        g dVge#f ej"        d d8gge#          j1         ej"        d ej$        gge#          j1        g dWge#f ej"        d dgge#          j1         ej"        d ej$        gge#          j1        g dXge#fgg dY'          dZ                         Z3d[ Z4ej                            d\e	e
g          d]             Z5d^ Z6d_ Z7ej                            d`d-dadbgfdcg ddfg dedfdggfgg dh'          di             Z8dj Z9ej                            dg dg dg ej"        g dkg dlg           ej"        g dg dge#          gg dm'          dn             Z:ej                            dP ej"        d d8gge#          j1         ej"        d dQgge#          j1        g dRgej-        f ej"        d2d0ggdS          j1         ej"        d2dTggdS          j1        g dUgej2        f ej"        d d8gge#          j1         ej"        d dQgge#          j1         ej"        g dR          gej-        fgg do'          dp             Z;dq Z<dr Z=ej                            dse%e>g          dt             Z?du Z@dv ZAdw ZBdx ZCdy ZDdz ZEej                            d{ej$        d e%d"          g          d|             ZFej                            d,dIdKgg d}g          d~             ZGej                            dd+d*gddg'          ej                            d,d-g dgd-dg'          d                         ZHej                            d\e	e
g          d             ZIej                            ddd0iddiddid0dddTddg          ej                            ddg dgg          d                         ZJej                            d,dcd-d8gg          d             ZKej                            d,d gdQgg          d             ZLej                            dddKiddiddiddiddidKdddTddg          d             ZMej                            d,d-d8gg          d             ZNej                            d,d gdQgg          d             ZOd ZPej                            ddKd2dddTig          d             ZQd ZRd ZSd ZTd ZUd ZVej                            ddd2dg          d             ZWej                            dd0dKdg          d             ZXej                            dg d          ej                            dg d          d                         ZYd ZZej                            d{ej$        dg          d             Z[d Z\ej                            d	dd
g          ej                            dddg          d                         Z]ej                            d	d
dg          d             Z^ej                            d	d
dg          d             Z_ej                            d	d
dg          d             Z`d Zad Zbej                            dej$        dg          d             Zcej                            dddg          ej                            dej$        dg          d                         Zdej                            dP ej"        d ej$        gge#          j1         ej"        d d8gge#          j1         ej"        d dQej$        ge#          gej-        f ej"        d ej$        gge#          j1         ej"        d d8gge#          j1         ej"        d dQej$        ge#          gej-        f ej"        dej$        ggej                  j1         ej"        dDggej                  j1         ej"        ddEej$        g          gej        fgg d'          d             Zeej                            d\e	e
g          d             Zfej                            d ej"        dej$        dDgg          j1         ej"        dej$        dgg          j1         ej"        dEgg          f ej"        g d¢g          j1         ej"        g dâg          j1         ej"        ej$        gg          f ej"        dej$        d8gge#          j1         ej"        dej$        dgg          j1         ej"        dQgge#          f ej"        g dŢge#          j1         ej"        g dƢg          j1         ej"        ej$        gge#          fg          dǄ             Zgej                            de          dɄ             Zhdʄ Ziej                            dddMgg ej"        ddMggdͬ           ej"        ddMggdά          g          ej                            ddLdMgg ej"        dLdMggdͬ           ej"        dLdMggdά          g          dЄ                         Zjdф Zkd҄ Zldӄ Zmej                            dd+d*g          dՄ             Znej                            d ej"        d gdgge#          d gej$        gej$        gg ejo        dgdgdgge#          f ej"        ej$        gdgd gge#          d gej$        gej$        gg ejo        dgej$        gej$        gge#          fg          d؄             Zpdل Zqdڄ Zrdۄ Zsd܄ Ztd݄ Zuej                            dddKiddiddiddiddidKdddTddg          dބ             Zvd߄ Zwd Zxd Zyd Zzej                            dddidd0ig          d             Z{ej                            ddd2iddig          d             Z|d Z}d Z~ej                            d\e	e
g          d             ZdS )    N)sparse)NotFittedError)OneHotEncoderOrdinalEncoder)is_scalar_nan)_convert_containerassert_allcloseassert_array_equal)CSR_CONTAINERSc                     t          j        g dg dg          } t                      }t          d          }|                    |           }|                    |           }|j        dk    sJ |j        dk    sJ t          j        |          sJ t          j        |          rJ t          |                                g dg dg           t          |                                |           d S )N         r   r   r   Fsparse_outputr      )              ?r   r   r   )r   r   r   r   r   )	nparrayr   fit_transformshaper   issparser
   toarray)X
enc_sparse	enc_denseX_trans_sparseX_trans_denses        h/var/www/surfInsights/venv3-11/lib/python3.11/site-packages/sklearn/preprocessing/tests/test_encoders.py!test_one_hot_encoder_sparse_denser$      s    	)))YYY'((AJE222I--a00N++A..M6))))&((((?>*****}-----   #<#<#<>W>W>W"X   ~--//?????    handle_unknownignoreinfrequent_if_existc                 n   t          j        g dg dg dg          }t          j        g dg          }t          d          }|                    |           t	          j        t          d          5  |                    |           d d d            n# 1 swxY w Y   t          |           }|                    |           |                                }t          |                    |          
                                t          j        g d	g                     t          ||           d S )
N)r   r   r   )r   r   r   )r   r   r   )   r   r   errorr&   Found unknown categoriesmatch)r   r   r   r   r   r   r   )r   r   r   fitpytestraises
ValueError	transformcopyr
   r   r	   r&   r   X2oh	X2_passeds        r#   #test_one_hot_encoder_handle_unknownr:   )   sa   
)))YYY			233A	999+		B 
g	.	.	.BFF1III	z)C	D	D	D  
R               
n	5	5	5BFF1III		I
Y''))
555677  
 B	"""""s   5BBBc                    t          j        g d                              d          }t          j        ddg                              d          }t          |           }|                    |           |                                }t          |                    |                                          t          j        g dg dg                     t          ||           d S )N)11111111223334444)r   55555r=   r,   )r   r   r   r   r   r   r   r   )	r   r   reshaper   r0   r5   r
   r4   r   r6   s        r#   +test_one_hot_encoder_handle_unknown_stringsrD   A   s    
22233;;GDDA	7D/	"	"	*	*7	3	3B
 
n	5	5	5BFF1III		I
Y''))
&&&(<(<(<=>>  
 r9%%%%%r%   output_dtypeinput_dtypec                 l   t          j        ddgg|           j        }t          j        ddgddgg|          }t          d|          }t	          |                    |                                          |           t	          |                    |                              |                                          |           t          d|d          }t	          |                    |          |           t	          |                    |                              |          |           d S )Nr   r   dtypeauto)
categoriesrI   F)rK   rI   r   )	r   asarrayTr   r
   r   r   r0   r4   )rF   rE   r   
X_expectedr8   s        r#   test_one_hot_encoder_dtyperO   T   s    	
QF8;///1AaVaV,LAAAJ	&	=	=	=Br''**2244jAAArvvayy**1--5577DDD	&E	R	R	RBr''**J777rvvayy**1--z:::::r%   c                    t          j        d          }|                    ddgddgd          }t          j        g dg dg| 	          }t          | 	          }t          |                    |                                          |           t          |	                    |          
                    |                                          |           t          | d
          }t          |                    |          |           t          |	                    |          
                    |          |           d S )Npandasabr   r   ABr   r   r   r   r   r   r   r   rH   F)rI   r   )r1   importorskip	DataFramer   r   r   r
   r   r   r0   r4   )rE   pdX_dfrN   r8   s        r#   !test_one_hot_encoder_dtype_pandasr]   c   s    		X	&	&B<<sCj1v6677D<<<6lKKKJ	\	*	*	*Br''--5577DDDrvvd||--d33;;==zJJJ	\	?	?	?Br''--z:::rvvd||--d33Z@@@@@r%   c                     t                      } g dg dg dg dg}|                     |           |                                 }t          g d|           |                     g d          }t          g d|           t	          j        t          d	          5  |                     d
dg           d d d            d S # 1 swxY w Y   d S )N)Maler   girlr   r   )Female)   r`   r   
   )r_   3   boy   r   )r_   [   r`         )	x0_Femalex0_Malex1_1x1_41x1_51x1_91x2_boyx2_girlx3_1x3_2x3_12x3_21x4_3x4_10x4_30)onetwothreefourfive)
one_Femaleone_Maletwo_1two_41two_51two_91	three_boy
three_girlfour_1four_2four_12four_21five_3five_10five_30z!input_features should have lengthr.   ry   rz   )r   r0   get_feature_names_outr
   r1   r2   r3   )encr   feature_namesfeature_names2s       r#   "test_one_hot_encoder_feature_namesr   s   sT   
//C!!!%%%"""$$$		A GGAJJJ--//M	
 	
 	
" 	%  * ../V/V/VWWN	
 	
 	
" 	%  * 
z)L	M	M	M 2 2!!5%.1112 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2s   CCCc                  0   t                      } t          j        ddggt                    j        }|                     |           |                                 }t          ddg|           |                     dg          }t          dd	g|           d S )
Nu   c❤t1dat2rH   u	   x0_c❤t1x0_dat2u   n👍meinput_featuresu   n👍me_c❤t1u   n👍me_dat2)r   r   r   objectrM   r0   r   r
   )r   r   r   s      r#   *test_one_hot_encoder_feature_names_unicoder      s    
//C
8V$%V4446AGGAJJJ--//MY/???--i[-IIM(.9=IIIIIr%   c                     d } t          |           }t          j        ddggt                    j        }|                    |           |                                }t          ddg|           |                    dg	          }t          d
dg|           d }t          |                              |          }d}t          j	        t          |          5  |                                 ddd           dS # 1 swxY w Y   dS )z=Check the behaviour of `feature_name_combiner` as a callable.c                 ,    | dz   t          |          z   S )N_)reprfeaturecategorys     r#   name_combinerzHtest_one_hot_encoder_custom_feature_name_combiner.<locals>.name_combiner   s    }tH~~--r%   )feature_name_combinerNoneNrH   z	x0_'None'x0_NonerR   r   za_'None'a_Nonec                     dS )Nr    r   s     r#   wrong_combinerzItest_one_hot_encoder_custom_feature_name_combiner.<locals>.wrong_combiner   s    qr%   zMWhen `feature_name_combiner` is a callable, it should return a Python string.r.   )r   r   r   r   rM   r0   r   r
   r1   r2   	TypeError)r   r   r   r   r   err_msgs         r#   1test_one_hot_encoder_custom_feature_name_combinerr      sg   . . . m
<
<
<C
64.!0002AGGAJJJ--//MY/???--cU-CCM
H-}===   n
=
=
=
A
A!
D
DCW  
y	0	0	0 $ $!!###$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $s   D  DDc                     t          j        ddgg          j        } t                      }|                    g dg           |                                d         g dgk    sJ |                    |                                           j        dk    sJ |                    g dg           |                    |                                           j        dk    sJ d S )	Nr   r   )r   r   r   r   rK   rK   )r   r*   )r   r   r   r   r*   r   )	r   r   rM   r   
set_params
get_paramsr   r   r   )r   r8   s     r#   test_one_hot_encoder_set_paramsr      s    
1a&A	BMMlll^M,,,==??<(\\\N::::A&&((.&8888MMooo.M///A&&((.&888888r%   c                 F   t          d          }|                    |           }t          dd          }|                    |           }t          |                                |           t	          j        |          r|j        dk    sJ |                                S )NrJ   r   FrK   r   csr)r   r   r	   r   r   r   format)r   r   Xtr1Xtr2s       r#   check_categorical_onehotr      s    
6
*
*
*CQD
6
?
?
?CQDDLLNND)))?4  9T[E%9%9%9%9<<>>r%   r   defr   7   abcr   r   )rc   r   r   )r   r   r   )rS   rU   cat)rR   rV   r   rH   )rS   r   r   rR   r   nan)Nr   r   )rR   r   r   )Nr   N)mixednumericr   z	mixed-nanzmixed-float-nanz
mixed-Nonezmixed-None-nanzmixed-None-float-nan)idsc                    t          t          j        |           d d dgf                   }t          |ddgddgg           t          t          j        |           d d ddgf                   }t          |g dg dg           t	          d                              |           }t          |                                g dg dg           d S )	Nr   r   )r   r   r   r   r   r   r   r   rJ   r   )r   r   r   r   r   )r   r   r   r   r   )r   r   r   r	   r   r   r   )r   Xtrs     r#   test_one_hot_encoderr      s    0 #28A;;qqq1#v#6
7
7CC1a&1a&)***
"28A;;qqq1a&y#9
:
:CC,,,5666
6
*
*
*
8
8
;
;CCKKMMOOO___#EFFFFFr%   sparse_FTdropfirstc                    g dg dg dg}t          ||          }|                    |          }t          j        |t                    }t          |                    |          |           ddgddgd	dgg}t          |d
|          }|                    |          }t          j        |          }t          |                    |          |           |g dg dg dg}t          || ddgddgg dg          }|                    |          }t          j        |t                    }d |d<   t          |                    |          |           ddgddgd	dgg}t          |ddgddgg|           }|                    |          }t          j        |t                    }d |d<   d |d d df<   t          |                    |          |           t          j        g dg dg          }t          j        d          }t          j
        t          |          5  |                    |           d d d            d S # 1 swxY w Y   d S )Nr   r   )r   r   r   r   r   rH   r   r   r   r   rJ   )r   rK   r   r   r   )6   r   8   )r   r&   rK   )r   r   r   r   )r   rK   r&   r   r   r   r   r   r   )Shape of the passed X data is not correctr.   )r   r   r   r   r   r
   inverse_transformreescaper1   r2   r3   )r&   r   r   r   r   X_trexpmsgs           r#   test_one_hot_encoder_inverser     s    
8A
gD
9
9
9CQD
(1F
#
#
#Cs,,T22C888
R1b'Ar7#A
g&t
L
L
LCQD
(1++Cs,,T22C888| ^^^^^^^^<!)A=
 
 

   ##hq'''D	30066<<< Wq"g2w'!AR))
 
 

   ##hq'''D	AAAqD	30066<<< 8YYY			*++D
)?
@
@C	z	-	-	- $ $d###$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $s   'I

IIz
X, X_transr   r   r   r   r   r   r   ry   rz   r{   rS   r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   c                    t          |                              |           }d}|rt          |d          }t          j        t
          |          5  |                    |           ddd           dS # 1 swxY w Y   dS )zCheck that `inverse_transform` raise an error with unknown samples, no
    dropped feature, and `handle_unknow="error`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    r   zqSamples \[(\d )*\d\] can not be inverted when drop=None and handle_unknown='error' because they contain all zerosr   r.   N)r   r0   r   r1   r2   r3   r   )r   X_transr   r   r   s        r#   ?test_one_hot_encoder_inverse_transform_raise_error_with_unknownr   @  s    & g
.
.
.
2
21
5
5C	A 
  8$Wh77	z	-	-	- ' 'g&&&' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' 's   A66A:=A:c                      t          j        ddgddgddggt                    } t          dd	          }|                    |           }t          |                    |          |            d S )
Nr_   r   ra   r   r   rH   	if_binaryFr   r   )r   r   r   r   r   r
   r   )r   oher   s      r#   &test_one_hot_encoder_inverse_if_binaryr   `  sr    
61+!}xm<FKKKA
[
>
>
>CQDs,,T22A66666r%   )r   r   N
reset_dropc                    t          j        ddgddgddggt                    }t          | d          }|                    |           |                    |          }|                                }|                    |	           t          |	                    |          |           t          |                    |          |           t          |                                |           d S )
Nr_   r   ra   r   r   rH   Fr   r   )r   r   r   r   r0   r4   r   r   r
   r   r	   )r   r   r   r   r   r   s         r#   test_one_hot_encoder_drop_resetr   g  s     	61+!}xm<FKKKA
T
7
7
7CGGAJJJ==D--//MNN
N###s,,T22A666CMM!$$d+++s0022MBBBBBr%   methodr0   r         @      @c                     t                      }d}t          j        t          |          5   t	          ||          |            d d d            d S # 1 swxY w Y   d S )Nz'Expected 2D array, got 1D array insteadr.   )r   r1   r2   r3   getattr)r   r   r8   r   s       r#   test_X_is_not_1Dr   v  s     
B
3C	z	-	-	-  FA                 s   AAAc                 8   t          j        d          }|                    g d          }t                      }dt	          |           d}t          j        t          |          5   t          ||           |           d d d            d S # 1 swxY w Y   d S )NrQ   )   r   r*   r   z+Expected a 2-dimensional container but got z	 instead.r.   )r1   rY   Seriesr   typer2   r3   r   )r   r[   r   r8   r   s        r#   test_X_is_not_1D_pandasr     s    		X	&	&B
		,,,A	B
JQ
J
J
JC	z	-	-	-  FA                 s   (BBBzX, cat_exp, cat_dtyper   r   r   rU   rV   )r   r   r   stringzmissing-floatzmissing-np.nan-objectzmissing-float-nan-objectc                    | | d d d         fD ]}t          d          }|                    |           t          |j        t                    sJ t          |j        |          D ]\  }}|                                }t          |d                   r0t          |d                   sJ |d d         |d d         k    sJ n|                                |k    sJ t          j	        |j
        |          sJ d S )Nr@   rJ   r   )r   r0   
isinstancecategories_listziptolistr   r   
issubdtyperI   )r   cat_exp	cat_dtypeXir   resr   res_lists           r#   test_one_hot_encoder_categoriesr     s   F !DDbD'l 7 7v...#/400000COW55 	7 	7HCzz||HSW%% +$Xb\22222}CRC00000zz||s****=I666666	77 7r%   zX, X2, cats, cat_dtypedrR   rS   cint64r*   r   r   r   )NrR   z)rR   rS   r  )rR   Nr  )r   r   zobject-stringzobject-string-nonezobject-string-nanzobject-None-and-nanc                 `   t          |          }t          j        g dg dg          }t          |                    |                                           |           t          |j        d                   t          |d                   k    sJ |j        d         	                                t          |d                   k    sJ |j        d         j
        |k    sJ t          |          }t          j        t          d          5  |                    |           d d d            n# 1 swxY w Y   t          ||          }t          j        g dg dg          }t          |                    |                              |                                          |           d S )	Nr   r   r   r   r   r   r   r   r-   r.   rK   r&   )r   r   r   )r   r   r   r
   r   r   r   rK   r   r   rI   r1   r2   r3   r0   r4   )r   r7   catsr   r&   r   r   s          r#   )test_one_hot_encoder_specified_categoriesr    s   f 4
(
(
(C
(OOO___5
6
6Cs((++3355s;;;q!""d47mm3333?1$$&&$tAw--7777 ?1#y0000 4
(
(
(C	z)C	D	D	D                
4
G
G
GC
(OOO___5
6
6Cswwr{{,,R0088::C@@@@@s   D00D47D4c                  D   t          j        ddggt                    j        } t	          g dg          }t          j        g dg dg          }t          |                    |                               |                                           |           t          |	                    |                                           |           |j
        d                                         g dk    sJ t          j        |j
        d         j        t           j                  sJ t          j        d	d
gg          j        } t	          g dg          }d}t          j        t"          |          5  |	                    |            d d d            d S # 1 swxY w Y   d S )NrR   rS   rH   )rS   rR   r   r   r  r  r   r   r   )r   r   r   z%Unsorted categories are not supportedr.   )r   r   r   rM   r   r
   r0   r4   r   r   r   r   r   rI   object_r1   r2   r3   )r   r   r   r   s       r#   (test_one_hot_encoder_unsorted_categoriesr    s   
3*V,,,.A
OOO#4
5
5
5C
(OOO___5
6
6Cswwqzz++A..6688#>>>s((++3355s;;;?1$$&&///9999=+12:>>>>> 	1a&A
III;
/
/
/C
1C	z	-	-	-  !                 s   2FFFEncoderc                 :   t          j        dt           j        dg          g} | |          }t          j        ddggt                    j        }t          j        t          d          5  |                    |           ddd           dS # 1 swxY w Y   dS )zTest encoder for specified categories that nan is at the end.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    r   r   r   rH   zNan should be the last elementr.   N)	r   r   r   r   rM   r1   r2   r3   r0   r  r  r   r   s       r#   ,test_encoder_nan_ending_specified_categoriesr    s     Ha^$$%D
'T
"
"
"C
1a&(((*A	z)I	J	J	J  


                    -BBBc                  |   t          j        ddgddggt                    j        } t	          g dg dg          }t          j        g d	g d
g          }t          |                    |                                           |           |j        d         	                                g dk    sJ t          j
        |j        d         j        t           j                  sJ |j        d         	                                g dk    sJ t          j
        |j        d         j        t           j                  sJ d S )NrR   rS   r   r   rH   r   )r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   rM   r   r
   r   r   r   r   r   rI   r
  r   r   r   s      r#   7test_one_hot_encoder_specified_categories_mixed_columnsr  #  s-   
3*q!f%V4446A
OOOYYY#?
@
@
@C
(2224R4R4RS
T
TCs((++3355s;;;?1$$&&///9999=+12:>>>>>?1$$&&)))3333=+12:>>>>>>>r%   c                      t          j        d          } |                     ddgddgd          }t          |          }t	          |g dg dg           d S )	NrQ   rR   rS   r   r   rT   rW   rX   )r1   rY   rZ   r   r	   )r[   r\   r   s      r#   test_one_hot_encoder_pandasr  0  sc    		X	&	&B<<sCj1v6677D
"4
(
(CC,,,566666r%   zdrop, expected_namesx0_cx2_br   )r  x1_2r  )r   r   rS   x0_bx2_a)r   binarymanualc                     g dg dg}t          |           }|                    |           |                                }t          ||           d S )N)r   r   rR   )rS   r   rS   r   )r   r0   r   r
   )r   expected_namesr   r   r   s        r#   'test_one_hot_encoder_feature_names_dropr  9  s\     
&A
T
"
"
"CGGAJJJ--//M~}55555r%   c                  *   ddgddgddgg} t          j        g dg dg dg          }t          j        d d	g          }t          d
d          }|                    |           }t	          |j        |           t          ||           ddgddgddgg} t          j        ddgddgddgg          }t          j        d	d g          }t          d
d          }|                    |           }t	          |j        |           t          ||           d S )Nrc   yes   nori   )r   r   r   r   rB   )r   r   r   r   r   r   Fr   truerR   falser   r   )r   r   r   r   r
   	drop_idx_r	   )r   expectedexpected_drop_idxr   results        r#   *test_one_hot_encoder_drop_equals_if_binaryr*  K  sE   
er4j2u+.Ax			3335I5I5IJ H $++
[
>
>
>Cq!!Fs}&7888FH%%% ###7Ax#sc3Z#s<==H!T++
[
>
>
>Cq!!Fs}&7888FH%%%%%r%   )rc   r   r   )r"  r   r   )r   r   r   c                 ,   t                      }t          j        g dg dgd          }t          |                    |           |                    d                     t          d          }t          |                    |           |           d S )Nr   r   r   r   r   r   r   rH   float64)r   r   r   r
   r   astyper  s      r#   test_ordinal_encoderr0  c  s     

C
(IIIyyy)
9
9
9Cs((++SZZ	-B-BCCC
w
'
'
'Cs((++S11111r%   )r   r   zobject-string-catc                 T   t          |          }t          j        dgdgg          }t          |                    |           |           t          |j        d                   t          |d                   k    sJ |j        d                                         t          |d                   k    sJ |j        d         j	        |k    sJ t          |          }t          j        t          d          5  |                    |           d d d            d S # 1 swxY w Y   d S )Nr   r   r   r   r-   r.   )r   r   r   r
   r   r   rK   r   r   rI   r1   r2   r3   r0   )r   r7   r  r   r   r   s         r#   )test_ordinal_encoder_specified_categoriesr2  t  sY   2 D
)
)
)C
(SEC5>
"
"Cs((++S111q!""d47mm3333?1$$&&$tAw--7777 ?1#y0000 D
)
)
)C	z)C	D	D	D                   s   :DD!$D!c                     g dg dg} t                      }|                    |           }t          j        | t                    }t          |                    |          |           t          j        g dg dg          }t          j        d          }t          j
        t          |          5  |                    |           d d d            d S # 1 swxY w Y   d S )Nr   r   rH   )r   r   r   r   rW   r   r.   )r   r   r   r   r   r
   r   r   r   r1   r2   r3   )r   r   r   r   r   s        r#   test_ordinal_encoder_inverser4    s   	(A


CQD
(1F
#
#
#Cs,,T22C888 8\\\<<<011D
)?
@
@C	z	-	-	- $ $d###$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $s   3CCCc                     t          dd          } t          j        ddgddgdd	ggt          
          }t          j        ddgddgddggt          
          }|                     |           |                     |          }t          j        ddgddgddggd
          }t          ||           |                     |          }t          j        dd gd dgddggt          
          }t          ||           d S )Nuse_encoded_valuer&   unknown_valuerR   xrS   yr   r  rH   xyblar   r   r   r   )r   r   r   r   r0   r4   r
   r   )r   X_fitr   X_trans_encr   X_trans_invinv_exps          r#   +test_ordinal_encoder_handle_unknowns_stringrB    s
   
(;2
N
N
NCHsCj3*sCj9HHHEhdeS\C:>fMMMGGGENNN--((K
(QGb!Wq!f-W
=
=
=C{C(((''44KhddC[3*=VLLLG{G,,,,,r%   rI   c                    t          dd          }t          j        ddgddgdd	gg| 
          }t          j        ddgddgddgg| 
          }|                    |           |                    |          }t          j        ddgddgddggd
          }t          ||           |                    |          }t          j        dd gd dgddggt          
          }t          ||           d S )Nr6  r8  r      r      r   	   rH   rf      r   r   )r   r   r   r0   r4   r
   r   r   )rI   r   r>  r   r?  r   r@  rA  s           r#   ,test_ordinal_encoder_handle_unknowns_numericrI    s
   
(;4
P
P
PCHq!fq!fq!f-U;;;EhB"a1a&1???GGGENNN--((K
(QIay1a&1
A
A
AC{C(((''44KhD	D!9q!f5VDDDG{G,,,,,r%   c                     t          dt          j                  } t          j        dgdgdgg          }|                     |           |                     dgdgdgg          }t          |dgdgt          j        gg           d S )Nr6  r8  r   r   r   r*   r   )r   r   r   r   r0   r4   r
   )r   r>  r   s      r#   (test_ordinal_encoder_handle_unknowns_nanrK    s     (;26
R
R
RCHqcA3_%%EGGENNNmmaS1#sO,,Gw!qcBF8 455555r%   c                     t          dt          j        t                    } t          j        dgdgdgg          }t          j        t          d          5  |                     |           d d d            d S # 1 swxY w Y   d S )Nr6  )r&   r9  rI   r   r   r   z'dtype parameter should be a float dtyper.   )	r   r   r   intr   r1   r2   r3   r0   )r   r>  s     r#   8test_ordinal_encoder_handle_unknowns_nan_non_float_dtyperN    s     *"&  C HqcA3_%%E	z)R	S	S	S                   s   A::A>A>c                  
   t          j        g dgt                    j        } g d}t	          |          }d}t          j        t          |          5  |                    |            d d d            d S # 1 swxY w Y   d S )N)LowMediumHighrQ  rP  rH   )rP  rQ  rR  r   z*Shape mismatch: if categories is an array,r.   )	r   r   r   rM   r   r1   r2   r3   r0   )r   r  r   r   s       r#   +test_ordinal_encoder_raise_categories_shaperS    s    
<<<=VLLLNA$$$D
D
)
)
)C
6C	z	-	-	-  


                 s   A88A<?A<c            	      L   t          d          t          j        g dg dgd          } t          j        ddgd	d
ggd          t          j        ddgd	d
ggd          t          j        ddgddgg          t          j        ddgddgg          t          j        ddgd	dggd          fD ]w                               t	          fdt          d          D                       sJ t                                                                        |            xddgd	d
gg                               t	          fdt          d          D                       sJ t                                                                        |            ddgd	dgg                               t	          fdt          d          D                       sJ t                                                                        |            d S )NrJ   r   )r   r   r   r   )r   r   r   r   r.  rH   r   r   r   r*   r   rR   rS   r   r      a   b   c   dr   c                 F    g | ]}j         |         j        j        k    S r   r   rI   ).0ir   r   s     r#   
<listcomp>z'test_encoder_dtypes.<locals>.<listcomp>  s*    JJJACOA&,7JJJr%   c                 n    g | ]1}t          j        j        |         j        t           j                  2S r   )r   r   r   rI   integerr[  r\  r   s     r#   r]  z'test_encoder_dtypes.<locals>.<listcomp>  s1    VVVcoa06
CCVVVr%   c                 <    g | ]}j         |         j        d k    S )r   rZ  r`  s     r#   r]  z'test_encoder_dtypes.<locals>.<listcomp>  s(    GGG"(H4GGGr%   )	r   r   r   r0   allranger
   r4   r   )r   r   r   s    @@r#   test_encoder_dtypesrd    sD   
6
*
*
*C
((((*>*>*>?y
Q
Q
QC 	1a&1a&!111
1a&1a&!333
3*sCj)**
4,t-..
1c(QH%X666 	< 	< 	


JJJJJqJJJKKKKK3==++3355s;;;;
Q!QAGGAJJJVVVVUSTXXVVVWWWWWs}}Q''//113777
SAs8AGGAJJJGGGGeAhhGGGHHHHHs}}Q''//11377777r%   c                  (   t          j        d          } t          d          t          j        g dg dgd          }|                     dd	gd
dgddgdd          }                    |           t          fdt          d	          D                       sJ t          
                    |                                          |           |                     dd	gddgddgd          }|d         j        |d         j        |d         j        g                    |           t          fdt          d
          D                       sJ t          
                    |                                          |           d S )NrQ   rJ   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r.  rH   r   r   r   r*   r   r   rU   rV   Cr   c                 <    g | ]}j         |         j        d k    S )r   rZ  r`  s     r#   r]  z.test_encoder_dtypes_pandas.<locals>.<listcomp>  s(    FFF"(G3FFFr%   rR   rS   r   r   rU   rV   rg  c                 H    g | ]}j         |         j        |         k    S r   rZ  )r[  r\  X_typer   s     r#   r]  z.test_encoder_dtypes_pandas.<locals>.<listcomp>  s,    HHH!"(F1I5HHHr%   )r1   rY   r   r   r   rZ   r0   rb  rc  r
   r4   r   rI   )r[   r   r   rj  r   s      @@r#   test_encoder_dtypes_pandasrk    s   		X	&	&B
6
*
*
*C
(	'	'	')G)G)GH  C
 	Aq6AaV<<GLLAGGAJJJFFFFU1XXFFFGGGGGs}}Q''//113777
Aq6c
#sDDEEAflAcFL!C&,7FGGAJJJHHHHHuQxxHHHIIIIIs}}Q''//11377777r%   c                  |    t                      } ddgddgg}t          j                            | j        |           d S )Nr_   r   ra   r   )r   r   testingassert_no_warningsr   )r   r   s     r#   test_one_hot_encoder_warningro    s>    
//C
!xm$AJ!!#"3Q77777r%   missing_valuec                    dddd| g}t          |          }g dg ddddd| gg}|                    |                                          }g dg d	g d
g}t          ||           |j        |u sJ d t          |j        |j                  D             }|                    |          }t          j
        |t                    }t          |d                   rt          |d d         |d d                    t          |d                   sJ t          |d                   sJ t          |d d d df         |d d d df                    t          |dd df         |dd df                    t          |d                   sJ t          |d                   sJ d S t          ||           t          ||           d S )Nr   rf   r   r   r   )r   rf   r   r   rR   )r   rf   r   r   rR   )r   r   r   r   r   )r   r   r   r   r   r   c                 $    g | ]\  }}||         S r   r   )r[  r   r   s      r#   r]  z4test_one_hot_encoder_drop_manual.<locals>.<listcomp>0  s-       %gG  r%   rH   r@   )r@   r@   )r   r   r   r
   r   r   r   r&  r   r   r   r   r   )	rp  cats_to_dropr   r   transr   dropped_catsX_inv_transX_arrays	            r#    test_one_hot_encoder_drop_manualrx  "  s   2q"m4L
\
*
*
*C	Ar=)	A
 a  ((**E??OOO___
=Cuc"""8|#### ),S_cm)L)L  L ''..Khq'''G \"%&& 1<,l3B3.?@@@\"-.....\"-.....7111crc6?K3B3,?@@@ 	72ss7+[SbS-ABBBWV_-----[01111111<6667K00000r%   )r   r   rb   rR   c                     t          |           }d}t          j        t          |          5  |                    g dg dg dg           d d d            d S # 1 swxY w Y   d S )Nr   z-`drop` should have length equal to the numberr.   r   r   )r   r   ;   )r   r1   r2   r3   r0   )r   r   r   s      r#   test_invalid_drop_lengthr{  G  s    
T
"
"
"C=G	z	1	1	1 B B@AAAB B B B B B B B B B B B B B B B B Bs   AA!Adensityr   denserR   r   rS   r  c                    t          |           }t          | |          }g dg dg}|                    |           |                    |           t          |j        |j                   |dk    rt          |j        d           n=t          ||j        |j                  D ]!\  }}}|t          |                   |k    sJ "t          |j        t          j	                  sJ |j        j
        t          k    sJ d S )Nr   r   )r   r   rR   r~  r   r   )r   r0   r
   r   r&  r   rM  r   r   ndarrayrI   r   )r|  r   ohe_baseohe_testr   drop_catdrop_idxcat_lists           r#   test_categoriesr  O  s    7333H7>>>H	&ALLOOOLLOOOx+X-ABBBw8-q1111,/($h&:-
 -
 	7 	7(Hh CMM*h66666h("*55555#v------r%   c                 R    d |                                              d         v sJ d S )NcategoricalX_types)	_get_tags)r  s    r#   "test_encoders_has_categorical_tagsr  c  s0    GGII//11)<<<<<<<r%   kwargsmax_categoriesmin_frequency   g(\?r   )r  r  rf   rK   rJ   rR   rS   r   r   c                 H   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          d|d	d
d|                     |          }t          |j        g dg           dgdgdgdgdgg}t          j        ddgddgddgddgddgg          }|                    |          }t          ||           d dgdgdz  z   D             }|	                    |          }t          ||           |
                                }	t          ddg|	           dS )zpTest that different parameters for combine 'a', 'c', and 'd' into
    the infrequent category works as expected.rR   r   rS   r"  r   rc   r   r   r(   F)rK   r&   r   rR   r   r   er   r   c                     g | ]}|gS r   r   r[  cols     r#   r]  z2test_ohe_infrequent_two_levels.<locals>.<listcomp>      HHHcSEHHHr%   infrequent_sklearnr*   r  x0_infrequent_sklearnNr   r   r   rM   r   r0   r
   infrequent_categories_r4   r	   r   r   )
r  rK   X_trainr   X_testr'  r   expected_invX_invr   s
             r#   test_ohe_infrequent_two_levelsr  h  st    h	SEBJ.#;seaiGHIIKG
 ,  	 
 
c'll  s1OOO3DEEEecUSEC53%0Fx!Q!Q!Q!Q!Q@AAHmmF##GHg&&&HHcU.B-Ca-G%GHHHL!!'**E|U+++--//M 78-HHHHHr%   c                    t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          d	d
d|                               |          }|j        d         |j        d                  dk    sJ t          j        dgdgg          }|                    |          }t          dgdgg|           |	                                }t          dg|           |                    |          }t          dgdgg|           dS )z3Test two levels and dropping the frequent category.rR   r   rS   r"  r   rc   r   r   r(   Fr   r&   r   r  r   r   r   r  r  N)r   r   rM   r   r0   r   r&  r4   r	   r   r
   r   )r   r  r   r  r   r   	X_inverses          r#   ,test_ohe_infrequent_two_levels_drop_frequentr    s/    h	SEBJ.#;seaiGHIIKG
,	  
 
c'll  ?1cmA./36666Xusen%%FmmF##GaS1#J(((--//M/0-@@@%%g..I 456	BBBBBr%   c                 <   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          d	d
d|           }d| d         d}t	          j        t          |          5  |                    |           ddd           dS # 1 swxY w Y   dS )z_Test two levels and dropping any infrequent category removes the
    whole infrequent category.rR   r   rS   r"  r   rc   r   r   r(   Fr   r  Unable to drop category r   ( from feature 0 because it is infrequentr.   Nr   r   rM   r   r1   r2   r3   r0   r   r  r   r   s       r#   5test_ohe_infrequent_two_levels_drop_infrequent_errorsr    s   
 h	SEBJ.#;seaiGHIIKG
,	  C YT!W
X
X
XC	z	-	-	-                      .BBBrG  gQ?g{Gz?rF  c                 6   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          dd	d
d|                     |          }t          |j        ddgg           dgdgdgdgdgg}t          j        g dg dg dg dg dg          }|                    |          }t          ||           dgdgdgdgdgg}|	                    |          }t          ||           |
                                }t          g d|           dS )zkTest that different parameters for combing 'a', and 'd' into
    the infrequent category works as expected.rR   r   rS   r"  r   rc   r   r   r(   Fr&   r   r  r-  r   r   r   r,  r  )r  r  r  Nr   r  )	r  r  r   r  r'  r   r  r  r   s	            r#    test_ohe_infrequent_three_levelsr    sp     h	SEBJ.#;seaiGHIIKG
 ,E EK 	c'll  s1S#J<@@@ecUSEC53%0FxIIIyyy)))YYYOPPHmmF##GHg&&& 
				L !!'**E|U+++--//M@@@-PPPPPr%   c                 ^   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          d	d
d|                               |          }t          j        dgdgdgg          }t          ddgddgddgg|                    |                     |                    d                              |           d}t          j	        t          |          5  |                    dgdgg          }ddd           n# 1 swxY w Y   t          ddgddgg|           dS )z5Test three levels and dropping the frequent category.rR   r   rS   r"  r   rc   r   r   r(   Fr  r   r   r'   r,   r-   r.   r  N)r   r   rM   r   r0   r	   r4   r   r1   warnsUserWarning)r   r  r   r  r   r   s         r#   .test_ohe_infrequent_three_levels_drop_frequentr    s    h	SEBJ.#;seaiGHIIKG
,	  
 
c'll  XusecU+,,FaVaVaV,cmmF.C.CDDD NN(N++//888
$C	k	-	-	- 0 0--#//0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 aVaV$g.....s   &DDDc                 <   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          d	d
d|           }d| d         d}t	          j        t          |          5  |                    |           ddd           dS # 1 swxY w Y   dS )z7Test three levels and dropping the infrequent category.rR   r   rS   r"  r   rc   r   r   r(   Fr  r  r   r  r.   Nr  r  s       r#   7test_ohe_infrequent_three_levels_drop_infrequent_errorsr    s    h	SEBJ.#;seaiGHIIKG
,	  C YT!W
X
X
XC	z	-	-	-                   r  c                  "   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        } t          d	d
d                              |           }t          |j        ddgg           dgdgdgdgg}t          j        g dg dg dg dg          }|                    |          }t          ||           dgg}d}t          j
        t          |          5  |                    |           ddd           dS # 1 swxY w Y   dS )zmTest that different parameters for combining 'a', and 'd' into
    the infrequent category works as expected.rR   r   rS   r"  r   rc   r   r   r+   F)r&   r   r  r-  r  r,  badz.Found unknown categories \['bad'\] in column 0r.   N)r   r   rM   r   r0   r
   r  r4   r	   r1   r2   r3   )r  r   r  r'  r   r   s         r#   (test_ohe_infrequent_handle_unknown_errorr  
  s    h	SEBJ.#;seaiGHIIKG
eA  	c'll  s1S#J<@@@ ecUSEC5)FxIIIyyy)))DEEHmmF##GHg&&& gYF
;C	z	-	-	-  f                 s   !DDDc                    t          j        dgdz  dgdz  z   gt                    j        }t	          dg dgddd	|                     |          }dgd
gdgdgdgg}t          j        ddgddgddgddgddgg          }|                    |          }t          ||           dddgg}dgdgg}|D ]R}|                    |                              |           t          dgdgg|                    |                     SdS )zG'a' is the only frequent category, all other categories are infrequent.rR   r   r  ri   rH   r   r   rR   rS   Fr(   rK   r   r&   rS   r   r   r   r   r   r   r   Nr   )	r   r   r   rM   r   r0   r4   r	   r   )r  r  r   r  r'  r   dropsr   s           r#   5test_ohe_infrequent_two_levels_user_cats_one_frequentr  "  s^    h	SEBJ./v>>>@G
 (((),  	 
 
c'll  ecUSEC53%0Fx!Q!Q!Q!Q!Q@AAHmmF##GHg&&& kC5)EecU^F ; ;D!!%%g...!qc
CMM&$9$9::::; ;r%   c                     t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   gt          	          j        } t	          g d
gddd                              |           }t          |j        g dg           dgdgdgdgdgg}t          j        ddgddgddgddgddgg          }|                    |          }t          ||           d dgdgdz  z   D             }|
                    |          }t          ||           dS )zFTest that the order of the categories provided by a user is respected.rR   r   rS   r"  r   rc   r   r   rH   r  Fr(   r   rK   r   r&   r  )r   r   rR   r  r   r   c                     g | ]}|gS r   r   r  s     r#   r]  z<test_ohe_infrequent_two_levels_user_cats.<locals>.<listcomp>T  r  r%   r  r*   Nr   r   r   rM   r   r0   r
   r  r4   r	   r   r  r   r  r'  r   r  r  s          r#   (test_ohe_infrequent_two_levels_user_catsr  >  s\   h
cURZ	3%"*	,uqy	89    (((),	  
 
c'll  s1OOO3DEEEecUSEC53%0Fx!Q!Q!Q!Q!Q@AAHmmF##GHg&&& IHcU.B-Ca-G%GHHHL!!'**E|U+++++r%   c                     t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   gt          	          j        } t	          g d
gddd                              |           }t          |j        ddgg           dgdgdgdgdgg}t          j        g dg dg dg dg dg          }|                    |          }t          ||           dgdgdgdgdgg}|
                    |          }t          ||           dS )zTest that the order of the categories provided by a user is respected.
    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.rR   r   rS   r"  r   rc   r   r   rH   r   r   rS   rR   Fr(   r  r  r,  r  r-  r  Nr  r  s          r#   *test_ohe_infrequent_three_levels_user_catsr  Y  s^   
 h
cURZ	3%"*	,uqy	89    (((),	  
 
c'll  s1S#J<@@@ecUSEC53%0FxIIIyyy)))YYYOPPHmmF##GHg&&&
 
				L !!'**E|U+++++r%   c                      t           j        g dg df         } t          ddd          }|                    |            ddgddgg}|                    |          }t          |g d	g d
g           dS )zaTest infrequent categories where feature 0 has infrequent categories,
    and feature 1 does not.	r   r   r   r   r   r   r   r   r   	r   r   r   r   r   r   r   r   r   r   r   F)r  r   r   r   r   r   r   r   r   )r   r   r   r   N)r   c_r   r0   r4   r	   )r   r   r  r   s       r#   test_ohe_infrequent_mixedr  }  s     	)))+F+F+FFGA
q{%
P
P
PCGGAJJJ!fq!fFmmF##G GlllLLL9:::::r%   c            
         t           j        g dg dg df         } t          ddd          }|                    |                                           }t          |j        d         d	d
g           t          |j        d	         d	dg           t          |j        d
         d           |                                }t          g d|           g dg dg dg dg dg dg dg dg dg	}t          ||           g dg dg}|	                    |          }g dg dg}t          ||                                           |
                    |          }t          j        g dg dgt                    }t          ||           t          ddd                              |           }t          j        t           d          5  |	                    |           ddd           n# 1 swxY w Y   g d g d!g}|	                    |          }g d"g dg}t          ||                                           |
                    |          }t          j        g d#g d$gt                    }t          ||           dS )%z?Test infrequent categories with feature matrix with 3 features.r  )	r   r   r   r   r   rc   r   r   r   )	r   r   r   r   r   r   r   r   r   rJ   r   r(   rK   r  r&   r   r   r   rc   N)x0_0x0_3r  x1_0x1_5x1_infrequent_sklearnx2_0x2_1)r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   )r*   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r  N)r  r   NrH   r+   r-   r.   )r   r   r   )r   rc   r   )r   r   r   r   r   r   r   r   )r  r  r   )r   r  r   )r   r  r   r   r   r
   r  r   r	   r4   r   r   r   r0   r1   r2   r3   )	r   r   r   r   r'  r  X_test_transr  r  s	            r#   'test_ohe_infrequent_multiple_categoriesr    sj    	###$$$###	%	A !<Q  C ""**,,Gs1!4q!f===s1!4q"g>>>s1!4d;;;
 --//M		
 		
 		
 	   	!                          
H Hg&&&ii#F==((L )((*B*B*BCHHl2244555!!,//E8	(	(	(*I*I*IJRX  L |U+++ !G  	c!ff  
z)C	D	D	D  f               ii$F==((L(((*B*B*BCHHl2244555!!,//E8	8	8	8:V:V:VW  L |U+++++s   
G,,G03G0c            
      B   t          j        d          } |                     g dg ddddg          }t          dd	d
          }|                    |                                          }t          |j        d         ddg           t          |j        d         g d           g dg dg dg dg dg dg dg dg dg	}t          ||           |                     ddgddgdddg          }g dg dg}|	                    |          }t          ||                                           |
                    |          }t          j        ddgddggt                    }t          ||           |                     ddgddgdddg          }|	                    |                                          }g dg dg}t          ||           |
                    |          }t          j        ddgddggt                    }t          ||           dS )zHTest infrequent categories with a pandas dataframe with multiple dtypes.rQ   	rR   fr   r  r  rR   r   rS   rS   	r   r   r   rc   rc   rf   r   r   r   )strrM  r  rM  columnsrJ   r   r(   r  r   rR   rS   r   r   r   rf   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r     rf   r  rH   r   r   N)r1   rY   rZ   r   r   r   r
   r  r	   r4   r   r   r   r   )	r[   r   r   r   r'  r  r  r  r  s	            r#   .test_ohe_infrequent_multiple_categories_dtypesr    s    
	X	&	&B
@@@111	
 	
  	 	 	A !<Q  C ""**,,Gs1!4sCjAAAs1!4jjjAAA 	
H Hg&&&\\3*b"X>>PU\WWF"""$6$6$67H==((LHl2244555!!,//E8
 4	5=Q7RS  L |U+++ \\3*b!W==u~\VVF==((0022L"""$6$6$67HHl+++!!,//E8
#	$';Q&?@  L |U+++++r%   rh   )r  r  c                    t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          dd	d
d| }|                    |           |                    dgg          }t          |dgg           dS ),All user provided categories are infrequent.rR   r   rS   r"  r   rc   r   r   r(   Fr  r   Nr   )r   r   rM   r   r0   r4   r	   r  r  r   r   s       r#   $test_ohe_infrequent_one_level_errorsr  +  s     h	SEBJ.#;seaiGHIIKG
 ,E EK C GGGmmcUG$$GGqcU#####r%   c                     t          j        dgdz  gt                    j        }t	          dg dgddd|                     |          }|                    dgdgg          }t          |d	gd	gg           d
S )r  r  r   rH   r  Fr(   r  rR   r   Nr   )r   r   r   rM   r   r0   r4   r	   r  s       r#   5test_ohe_infrequent_user_cats_unknown_training_errorsr  9  s     h	{&1113G
 (((),  	 
 
c'll  mmcUSEN++GGqcA3Z(((((r%   zinput_dtype, category_dtype)OOOUUOUUSOSUSS
array_type)r   r   	dataframec                 :   t          j        dgdgg|           }t          j        ddg|          g}t          |d                              |          }t	          dgdgdgdgg||           }|                    |          }t          j        ddgddgddgddgg          }t          ||           t          |                              |          }	|	                    |          }t          j        dgdgdgdgg          }t          ||           d	S )
a"  Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    rS   rR   rH   Fr   r   r   r   N)	r   r   r   r0   r   r4   r	   r   r
   )
rF   category_dtyper  r   rK   r   r  r   r'  oes
             r#   test_encoders_string_categoriesr  J  s7    	3%#{333A(C:^<<<=J
:U
C
C
C
G
G
J
JC
use$j  F mmF##Gx!Q!Q!Q!Q899HGX&&&	:	.	.	.	2	21	5	5Bll6""Gx!qcA3,--Hw)))))r%   c                  H   t          j        dgdggd          } t          j        ddgd          g}t          |d          }t          j        d          }t          j        t          |	          5  |                    |            d
d
d
           d
S # 1 swxY w Y   d
S )zCheck that this mixture of predefined categories and X raises an error.

    Categories defined as bytes can not easily be compared to data that is
    a string.
    rS   rR   UrH   SFr   zjIn column 0, the predefined categories have type 'bytes' which is incompatible with values of type 'str_'.r.   N)	r   r   r   r   r   r1   r2   r3   r0   )r   rK   r   r   s       r#   $test_mixed_string_bytes_categoricalsr  i  s     	3%#s+++A(C:S1112J
:U
C
C
CC
)	' C
 
z	-	-	-  


                 s   4BBBc                     t          j        dd| d| ggt                    j        }t	          dd                              |          }|                                }t          |ddd	|  g           d S )
NrR   rS   rH   Fr'   r   r&   x0_ar  x0_)r   r   r   rM   r   r0   r   r
   )rp  r   r   namess       r#   )test_ohe_missing_values_get_feature_namesr  ~  s     	3]C?@OOOQA
eH
E
E
E
I
I!
L
LC%%''Euvv/D]/D/DEFFFFFr%   c            	      6   t          j        d          } |                     g dt          j        dddt          j        gt                    ddd	g
          }t          j        g dg dg dg dg          }t          |          }t          ||           d S )NrQ   )dogr   Nr   r   r   r*   rH   )col1col2r  r  r  )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )	r1   rY   rZ   r   r   r   floatr   r	   )r[   dfexpected_df_transr   s       r#   %test_ohe_missing_value_support_pandasr    s    		X	&	&B	///HaArv.e<<<	
 	
   
 
 
B !!!!!!!!!!!!		
  #2
&
&CC*+++++r%   pd_nan_typepd.NAznp.nanc           
      @   t          j        d          }| dk    r|j        nt          j        }|                    d|                    dd|ddgd          i          }t          j        g d	g d
g dg dg d
g          }t          d|          }|	                    |          }t          ||           t          |j                  dk    sJ t          |j        d         d d         g d           t          j        |j        d         d                   sJ d S )NrQ   r  r  r   rR   rS   r   rH   )r   r   r   r   )r   r   r   r   )r   r   r   r   r  Fr  r   r   r@   r   )r1   rY   NAr   r   rZ   r   r   r   r   r	   lenr   r
   isnan)r  r&   r[   pd_missing_valuer  r  r   df_transs           r#   1test_ohe_missing_value_support_pandas_categoricalr    s@    
	X	&	&B +w 6 6ruuBF	BIIsC)93DJIWW	

 
B
 LLLLLLLLLL	
  eN
K
K
KC  $$H%x000s1$$$$sq)#2#.@@@8COA&r*+++++++r%   c                 2   ddgddgddgg}t          dd|           }|                    |          }t          j        g d	g d
g dg          }t	          ||           ddgg}t          j        g d	g          }d}t          j        t          |          5  |                    |          }ddd           n# 1 swxY w Y   t	          ||           |	                    |          }t          |t          j        ddggt                               dS )zZCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during transform.rR   r   rS   r   r   r   Fr   r   r&   r   r   )r   r   r   r   r   tFound unknown categories in columns \[0, 1\] during transform. These unknown categories will be encoded as all zerosr.   NrH   r   r   r   r   r	   r1   r  r  r4   r   r
   r   r&   r   r   r   rN   r  warn_msgr  s           r#   /test_ohe_drop_first_handle_unknown_ignore_warnsr    s    qC8c1X&A
E.  C ""GIIIIII	
 J GZ((( AhZF999+&&J	 
 
k	2	2	2 ( (--''( ( ( ( ( ( ( ( ( ( ( ( ( ( (GZ((( !!*--Eubhaz@@@AAAAA   B;;B?B?c                 2   ddgddgddgg}t          dd|           }|                    |          }t          j        g d	g d
g dg          }t	          ||           ddgg}t          j        g dg          }d}t          j        t          |          5  |                    |          }ddd           n# 1 swxY w Y   t	          ||           |	                    |          }t          |t          j        ddggt                               dS )zDCheck drop='if_binary' and handle_unknown='ignore' during transform.rR   r   rS   r   r   r   Fr  r  r   rW   r   r   )r   r   r   r   r  r.   NrH   r  r  s           r#   3test_ohe_drop_if_binary_handle_unknown_ignore_warnsr    s    qC8c1X&A
n  C ""GLLLLLL	
 J GZ((( AhZF<<<.))J	 
 
k	2	2	2 ( (--''( ( ( ( ( ( ( ( ( ( ( ( ( ( (GZ((( !!*--Eubhd}FCCCDDDDDr  c                 d   ddgddgddgg}t          dd| ddgddgg          }|                    |           d	dgg}t          j        ddgg          }d
}t	          j        t          |          5  |                    |          }ddd           n# 1 swxY w Y   t          ||           dS )znCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during fit with categories passed in.rR   r   rS   r   r   r   F)r   r   r&   rK   r   zqFound unknown categories in columns \[0\] during transform. These unknown categories will be encoded as all zerosr.   N)	r   r0   r   r   r1   r  r  r4   r	   )r&   r   r   r  rN   r  r   s          r#   'test_ohe_drop_first_explicit_categoriesr  	  s   
 qC8c1X&A
%#JA'	  C GGAJJJAhZFAq6(##J	A  
k	2	2	2 ( (--''( ( ( ( ( ( ( ( ( ( ( ( ( ( (GZ(((((s   3BBBc                  
   t          j        d          } |                     g dg ddddg          }t          d	          }|                    d
           d}t          j        t          |          5  |                    |           ddd           n# 1 swxY w Y   |                    |           t          j        t          |          5  |	                    |           ddd           dS # 1 swxY w Y   dS )zJRaise informative error message when pandas output and sparse_output=True.rQ   r  )r  rS   rS   )rR   rS   rR   rS   r  Tr   r4   zxPandas output does not support sparse data. Set sparse_output=False to output pandas dataframes or disable Pandas outputr.   N)
r1   rY   rZ   r   
set_outputr2   r3   r   r0   r4   )r[   r  r   r   s       r#   'test_ohe_more_informative_error_messager  $  s   		X	&	&B	IIIOOO<<sCj	Q	QB
d
+
+
+CNNXN&&&	S  
z	-	-	-  "               GGBKKK	z	-	-	-  b                 s$   8BB!BC88C<?C<c                  :   t          j        t           j        dddgg          j        } t	          t           j                  }dt           j         }t          j        t          |          5  |	                    |            ddd           dS # 1 swxY w Y   dS )zDTest ordinal encoder with nan passthrough fails when dtype=np.int32.r   r   rH   zdThere are missing values in features \[0\]. For OrdinalEncoder to encode missing values with dtype: r.   N)
r   r   r   rM   r   int32r1   r2   r3   r0   )r   r  r   s      r#   Btest_ordinal_encoder_passthrough_missing_values_float_errors_dtyper"  8  s     	263S)*++-A	bh	'	'	'B	;02	; 	;  
z	-	-	-  
q			                 r  encoded_missing_valuer7  c                    t          j        t           j        dddggt           j                  j        }t          |                               |          }t          |j                  dk    sJ t          |j        d         ddt           j        g           |
                    |          }t          || gdgdgdgg           |                    |          }t          ||           dS )	z.Test ordinal encoder with nan on float dtypes.r   r   rH   r#  r   r   r   N)r   r   r   r.  rM   r   r0   r  r   r	   r4   r   )r#  r   r  r   r  s        r#   5test_ordinal_encoder_passthrough_missing_values_floatr&  F  s     	263S)*"*===?A	.C	D	D	D	H	H	K	KBr~!####BN1%S"&'9:::ll1ooGG45usecUKLLL$$W--IIq!!!!!r%   c           
         t          j        d          }| dk    r|j        nt          j        }|                    d|                    dd|ddgd          i          }t          |	                              |          }t          |j
                  d
k    sJ t          |j
        d         dd         g d           t          j        |j
        d         d                   sJ |                    |          }t          |dgdg|gdgdgg           |                    |          }|j        dk    sJ t          |dddf         ddg           t          |dddf         ddg           t          j        |d                   sJ dS )z0Check ordinal encoder is compatible with pandas.rQ   r  r  r   rR   rS   r   rH   r%  r   r   Nr   r   r@          @r   r   )r   r   r   r   )r1   rY   r
  r   r   rZ   r   r   r0   r  r   r
   r  r4   r	   r   r   )r  r#  r[   r  r  r  r  r  s           r#   =test_ordinal_encoder_missing_value_support_pandas_categoricalr)  X  s    
	X	&	&B +w 6 6ruuBF	BIIsC)93DJIWW	

 
B 
.C	D	D	D	H	H	L	LBr~!####r~a(!,ooo>>>8BN1%b)*****||BHHuse.C-DsecUSTTT$$X..I?f$$$$y!Q'#s444yQ'#s4448IdO$$$$$$$r%   r(  )zobject-None-missing-valuezobject-nan-missing_valueznumeric-missing-valuec                    t          |          }t          j        dgt          j        gg          }t	          |                    |           |           |j        d         j        |k    sJ t          |          }t          j	        t          d          5  |                    |           ddd           dS # 1 swxY w Y   dS )z.Test ordinal encoder for specified categories.r   r   r   r-   r.   N)r   r   r   r   r
   r   r   rI   r1   r2   r3   r0   )r   r7   r  r   r  r   s         r#   =test_ordinal_encoder_specified_categories_missing_passthroughr+  y  s   L 
4	(	(	(B
(SEBF8$
%
%Cr''**C000 >!"i//// 
4	(	(	(B	z)C	D	D	D  
r


                 s   B<<C C c                 2   t          j        g dt                    g} | |          }t          j        ddggt                    j        }t	          j        t          d          5  |                    |           ddd           dS # 1 swxY w Y   dS )	zTest encoder for specified categories have duplicate values.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    )rR   rS   rR   rH   r   rR   rS   z5the predefined categories contain duplicate elements.r.   N)r   r   r   rM   r1   r2   r3   r0   r  s       r#   +test_encoder_duplicate_specified_categoriesr-    s     H___F3334D
'T
"
"
"C
3*V,,,.A	Q
 
 
   	


                 s   )BBBzX, expected_X_trans, X_testr   r   )r   r   r   )r   r(  r   r   )r   rR   rS   )r(  r   r   c                     t          dd          }|                    |           }t          ||           t          |                    |          dgg           dS )z>Test the interaction between missing values and handle_unknownr6  r@   r8  g      N)r   r   r	   r4   )r   expected_X_transr  r  r   s        r#   /test_ordinal_encoder_handle_missing_and_unknownr0    sa    8 
':"	M	M	MBq!!GG-...BLL((D6(33333r%   csr_containerc                 b   t          j        g dg dg          } | |          }t                      }d}t          j        t
          |          5  |                    |           ddd           n# 1 swxY w Y   t          j        t
          |          5  |                    |           ddd           n# 1 swxY w Y   |                    |          } | |          }t          j        t
          |          5  |                    |           ddd           dS # 1 swxY w Y   dS )zCheck that we raise proper error with sparse input in OrdinalEncoder.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    r   r   z2Sparse data was passed, but dense data is requiredr.   N)	r   r   r   r1   r2   r   r0   r   r   )r1  r   X_sparseencoderr   r   r!   s          r#   test_ordinal_encoder_sparser5    s    	)))YYY'((A}QHGBG	y	0	0	0  H              	y	0	0	0 ( (h'''( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ##A&&G"]7++N	y	0	0	0 2 2!!.1112 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2s6   A33A7:A7B;;B?B?D$$D(+D(c                  h   t          j        g d          ddt           j        f         } t          g dgdd          }|                    |            t          g dgd          }t          j        t          d	
          5  |                    |            ddd           dS # 1 swxY w Y   dS )zCheck OrdinalEncoder.fit works with unseen category when
    `handle_unknown="use_encoded_value"`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    )r   r   r   r   r   r   N)r@   r   r   r6  rD  )rK   r&   r9  r+   r  r-   r.   )r   r   newaxisr   r0   r1   r2   r3   )r   r  s     r#   -test_ordinal_encoder_fit_with_unseen_categoryr8    s     	###$$QQQ
]3A	JJ<0CSW
 
 
B FF1III	JJJ<	H	H	HB	z)C	D	D	D  
q			                 s   B''B+.B+r  AAOr  r  c                     t          dd          }|                    |            |                    |          }t          |ddgg           dS )zChecks that `OrdinalEncoder` transforms string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    r6  ir8  r   N)r   r0   r4   r	   )r  r  r   r   s       r#   1test_ordinal_encoder_handle_unknown_string_dtypesr<  	  sV    * (;2
N
N
NCGGGmmF##GGr1gY'''''r%   c                  R   t          j        g d                              dd          } t                                          |           }t          |j        t          j        | d          j                   |	                    |           }t          |dgdgdgdgg           dS )	zCheck that `OrdinalEncoder` accepts Python integers that are potentially
    larger than 64 bits.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    )l   	HP
1& l   	H]viel   	 :?i}Ga l   IRK2e6kr@   r   r   )axisr   r   N)
r   r   rC   r   r0   r
   r   sortrM   r4   )r   r4  r   s      r#   #test_ordinal_encoder_python_integerr@  %  s     		
 	
 	
	 	 gb!nn  ""1%%Gw*BGAA,>,>,>,@AAA""Gw!qcA3 455555r%   c                      t          j        d          } g d}|                     g dg|          }t                                          |          }|                                }t          ||           dS )z-Check feature names out is same as the input.rQ   )rS   r   rR   r  r  N)r1   rY   rZ   r   r0   r   r
   )r[   r  r   r   feature_names_outs        r#   .test_ordinal_encoder_features_names_out_pandasrC  9  sx    		X	&	&BOOE
iii[%00A




q
!
!C1133u/00000r%   c                  V   t          j        dgdgt           j        ggt                    } t	          dt           j        d                              |           }|                    |           }t          |dgdgdgg           t          j        d	gt           j        ggt                    }|                    |          }t          |t           j        gdgg           |                    |          }|d         d         J t          j	        |d         d                   sJ d
S )zECheck interactions between encode_unknown and missing value encoding.rR   rS   rH   r6  r&   r9  r#  r   r   r   N)
r   r   r   r   r   r0   r4   r	   r   r  )r   r  r   r  r  X_roundtrips         r#   0test_ordinal_encoder_unknown_missing_interactionrH  E  s$    	3%#)888A	*f 
 
 
 
c!ff	  ll1ooGGqcA3-... Xurvh'v666F<<''LLBF8bT"2333 &&|44K q>!$$$ 8KN1%&&&&&&&r%   with_pandasc                    t          j        ddgddgdt           j        ggt                    }d}| r3t	          j        d          }|                    |d	d
g          }|dz   }n|dz   }t          d          }t	          j        t          |          5  |
                    |           ddd           dS # 1 swxY w Y   dS )zXCheck OrdinalEncoder errors when encoded_missing_value is used by
    an known category.rR   r   rS   r   r   rH   zTencoded_missing_value \(1\) is already used to encode a known category in features: rQ   letterpetr  z	\['pet'\]z\[1\]r   r%  r.   N)r   r   r   r   r1   rY   rZ   r   r2   r3   r0   )rI  r   	error_msgr[   r  s        r#   0test_ordinal_encoder_encoded_missing_value_errorrN  c  s    	3,esBFm<FKKKA
	 
  ) **LLXu$5L66,		(		a	0	0	0B	z	3	3	3  
q			                 s   B99B= B=z4X_train, X_test_trans_expected, X_roundtrip_expected1c                    t          dt          j        t          j                                      |           }t          j        dgt          j        gdgg          }|                    |          }t          ||           |                    |          }|j        d         }t          |          D ]K}||df         }	||df         }
|	|
J t          |	          rt          j        |
          sJ C|
|	k    sJ LdS )znCheck transform when unknown_value and encoded_missing_value is nan.

    Non-regression test for #24082.
    r6  rF  rO  rS   r   N)r   r   r   r0   r   r4   r	   r   r   rc  r   r  )r  X_test_trans_expectedX_roundtrip_expectedr  r  r  rG  	n_samplesr\  expected_valvals              r#   9test_ordinal_encoder_unknown_missing_interaction_both_nanrV  }  s"   4 
*f f
 
 
 
c'll	  Xurvh.//F<<''L L"7888&&|44K$*1-I9 	' 	'+AqD1!Q$;;;;<(( 	'8C==    ,&&&&&	' 	'r%   c                     t          j        d          } |                     ddgddgd          }t                      }|                    d           d}t          j        t          |	          5  |                    |           d
d
d
           n# 1 swxY w Y   t          d                              d          }t          d                              d          }|                    |          }|                    |          }t          |	                                |           t          |                                |j                   d
S )z*Check OneHotEncoder works with set_output.rQ   rR   rS   r   r   rT   r  zCPandas output does not support sparse data. Set sparse_output=Falser.   NFr   default)r1   rY   rZ   r   r  r2   r3   r   r	   to_numpyr
   r   r  )r[   r\   r   r/   ohe_default
ohe_pandas	X_defaultX_pandass           r#   test_one_hot_encoder_set_outputr^    s~   		X	&	&B<<sCj1v6677D
//CNNXN&&&QE	z	/	/	/    $                               e444??)?TTKU333>>>RRJ))$//I''--HH%%''333z77998;KLLLLLs   2BBBc                     t          j        d          } |                     ddgddgd          }t                                          d          }t                                          d          }|                    |          }|                    |          }t          |                                |           t          |	                                |j
                   d	S )
z+Check OrdinalEncoder works with set_output.rQ   rR   rS   r   r   rT   rX  r  N)r1   rY   rZ   r   r  r   r	   rY  r
   r   r  )r[   r\   ord_default
ord_pandasr\  r]  s         r#   test_ordinal_set_outputrb    s    		X	&	&B<<sCj1v6677D ""--	-BBK!!,,x,@@J))$//I''--HH%%''333z77998;KLLLLLr%   c                  8   g dddgg} t          |           }|                    ddgg           t          |           t          |j                  k    sJ t	          |j                  D ]-\  }}|j        t          k    sJ t          | |         |           .dS )zjCheck that the categories_ dtype is `object` for string categories

    Regression test for gh-25171.
    )asmmaseasrasacsrO  2r   rd  N)r   r0   r  r   	enumeraterI   r   r
   )rK   r   nr   s       r#    test_predefined_categories_dtyperl    s    
 655SzBJ
:
.
.
.CGGdC[Mz??c#/222222CO,, / /3yF"""":a=#..../ /r%   c                     t          j        dgdgt           j        ggt                    } t	          d                              |           }t          |dgdgdgg           t	          dd	                              |           }t          j        d
gg          }|                    |          }t          |dgg           dS )zBCheck missing value or unknown encoding can equal the cardinality.r   r   rH   r   r%  r   r   r6  r8  snakeN)	r   r   r   r   r   r   r	   r0   r4   )r   r   r   r  s       r#   1test_ordinal_encoder_missing_unknown_encoding_maxro    s    
5'E7RVH-V<<<A1555CCAFFGGqcA3_---
(;1
M
M
M
Q
QRS
T
TCXyk""FmmF##GGqcU#####r%   c                     t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   dgdz  z   gt                    j        } t	          dd	d
                              |           }t          |                                g d           |j        d         |j	        d                  dk    sJ t          j        dgdz  dgdz  z   dgdz  z   gt                    j        } t	          dd	d                              |           }t          |                                dg           |j        d         |j	        d                  dk    sJ t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   dgdz  z   gt                    j        } t	          dd	dg                              |           }t          |                                g d           |j        d         |j	        d                  dk    sJ t	          dd	d                              |           }t          |                                g d           |j	        J dS )zkCheck drop_idx is defined correctly with infrequent categories.

    Non-regression test for gh-25550.
    rR   r   rS   r*   r   r   r  rH   Fr   )r  r   r   )r  x0_dx0_er  r   rc   r   r  )r  r  rr  r  N)r  r  rq  rr  r  )
r   r   r   rM   r   r0   r
   r   r   r&  )r   r   s     r#   #test_drop_idx_infrequent_categoriesrs    s   
 	
cUQY	#	*cUQY	6#	BC6	 	 	  au7
K
K
K
O
OPQ
R
RC!!##%V%V%V   ?1cmA./36666
3%!)seai'3%"*45VDDDFA
au;
O
O
O
S
STU
V
VCs00225L4MNNN?1cmA./36666

cUQY	#	*cUQY	6#	BC6	 	 	  auC5
I
I
I
M
Ma
P
PC!!##%V%V%V   ?1cmA./36666
au4
H
H
H
L
LQ
O
OC!!##AAA   =     r%   c                    t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   g          j        }t          dd	d
d|                     |          }t          |j        g dg           t          |j        ddgg           dgdgdgdgdgg}dgdgdgdgd
gg}|                    |          }t          ||           |
                    |          }dgdgdgdgdgg}t          ||           dS )zGTest parameters for grouping 'a', and 'd' into the infrequent category.rR   r   rS   r"  r   rc   r   r   r6  r@   r8  r  r  r   r   r   r  Nr   )r   r   rM   r   r0   r
   r   r  r4   r	   r   )r  r  ordinalr  expected_transr   r  expected_inverses           r#   ,test_ordinal_encoder_infrequent_three_levelsrx    sR    h	SEBJ.#;seaiGHIIKG *" @F 	c'll  w*-A-A-A,BCCCw5c
|DDDecUSEC53%0FcA3aS2$/N''GG^,,,))'22I					 y"233333r%   c                     t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   gt          	          j        } t	          g d
gddd                              |           }t          |j        g d
g           t          |j        ddgg           dgdgdgdgdgg}dgdgdgdgdgg}|	                    |          }t          ||           |                    |          }dgdgdgdgdgg}t          ||           dS )zTest that the order of the categories provided by a user is respected.

    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.
    rR   r   rS   r"  r   rc   r   r   rH   r  r6  r@   )rK   r  r&   r9  r  r   r   r   r  N)r   r   r   rM   r   r0   r
   r   r  r4   r	   r   )r  ru  r  rv  r   r  rw  s          r#   6test_ordinal_encoder_infrequent_three_levels_user_catsrz  @  sh    h
cURZ	3%"*	,uqy	89    ((()*	  
 
c'll  w*-A-A-A,BCCCw5c
|DDDecUSEC53%0FcA3aS2$/N''GG^,,,))'22I					 y"233333r%   c                     t          j        g dg df          } t          d                              |           }t	          |j        d         ddg           |j        d         J ddgddgg}ddgddgg}|                    |          }t          ||           |                    |          }t          j	        ddgd	dggt          
          }t	          ||           dS )zETest when feature 0 has infrequent categories and feature 1 does not.r  r  r   r  r   r   r   Nr  rH   )r   column_stackr   r0   r
   r  r4   r	   r   r   r   )r   ru  r  rv  r   r  rw  s          r#   %test_ordinal_encoder_infrequent_mixedr~  d  s	    	4446Q6Q6QRSSAA...22155Gw5a81a&AAA)!,444!fq!fF!fq!f%N''GG^,,,))'22Ix!Q*>)B C6RRRy"233333r%   c            	         t          j        d          } |                     g d          }|                     g dg d|                     dgdz  dgdz  z   d	gz   d
gz   |          dg d          }t          d                              |          }t          |j        d         ddg           t          |j        d         g d           t          |j        d         d
d	g           |                     g dg d|                     dgd	gz   d
gz   dgz   |          dg d          }g dg dg dg dg}|	                    |          }t          ||           dS )zHTest infrequent categories with a pandas DataFrame with multiple dtypes.rQ   )birdr   r   rn  r  r  r   r*   r   r   rn  r  rH   )r  rM  r  r  r|  r   rR   rS   r   r  r   )rR   rS   r  r   )rf   r   rc   r   )r   r   r   )r   r   r   )r   r   r   r  N)r1   rY   CategoricalDtyperZ   r   r   r0   r
   r  r4   r	   )r[   categorical_dtyper   ru  r  rv  r   s          r#   :test_ordinal_encoder_infrequent_multiple_categories_dtypesr  y  s    
	X	&	&B++,K,K,KLL
@@@11199!ugk)WI5@' %  	
 	
 .-- 	 
	 
	A A...22155G w5a83*EEEw5a8***EEEw5a867:KLLL\\'''!>>997)#vh.%8' %  	
 	
 .--  
 
F  iiIIIyyyAN''GG^,,,,,r%   c                     t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   t           j        gz   gt          	          j        } t          d
ddd                              |           }t          |j        g dg           t          j        dgdgdgdgdgt           j        ggt          	          }dgdgdgdgdgdgg}|	                    |          }t          ||           dS )zJCheck behavior of unknown_value and encoded_missing_value with infrequent.rR   r   rS   r"  r   rc   r   r   rH   r6  r   )r&   r9  r  r#  r  r  r   r   N)r   r   r   r   rM   r   r0   r
   r  r4   r	   )r  ru  r  rv  r   s        r#   .test_ordinal_encoder_infrequent_custom_mappingr    s   h
cURZ	3%"*	,uqy	8BF8	CDF    *	  
 
c'll  w57HIIIXusecUSEC526(C6RRRFcA3aS1#s3N''GG^,,,,,r%   c                    t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   gt          	          j        }t	          di | d
dd                    |          }t	          d
d                              |          }dgdgdgdgdgg}t          |                    |          |                    |                     dS )zMAll categories are considered frequent have same encoding as default encoder.rR   r   rS   r"  r   rc   r   r   rH   r6  r@   r8  r  Nr   r   r   r   rM   r   r0   r	   r4   )r  r  adjusted_encoderdefault_encoderr  s        r#   !test_ordinal_encoder_all_frequentr    s    h
cURZ	3%"*	,uqy	89    &  
!4B   	c'll  %*"  	c'll  ecUSEC53%0F""6**O,E,Ef,M,M    r%   d   c                 4   t          j        dgdz  dgdz  z   dgdz  z   dgdz  z   gt          	          j        }t	          di | d
dd                    |          }dgdgdgdgdgg}t          |                    |          dgdgdgdgdgg           dS )zAWhen all categories are infrequent, they are all encoded as zero.rR   r   rS   r"  r   rc   r   r   rH   r6  r@   r8  r  r   Nr   r  )r  r  r4  r  s       r#   #test_ordinal_encoder_all_infrequentr    s     h
cURZ	3%"*	,uqy	89      
!4B   	c'll  ecUSEC53%0FG%%f--aS1#sRD/IJJJJJr%   c                     t          j        t           j        gdz  dgdz  z   dgdz  z   dgz   dgz   gt                    j        } t          d	
                              |           }t          j        dddt           j        ggt                    j        }|                    |          }t          |dgdgdgt           j        gg           dS )z5Check behavior when missing value appears frequently.r"  r   rc   r   r   rn  deerrH   r   r|  r   r   r   N	r   r   r   r   rM   r   r0   r4   r	   r   ru  r  r   s       r#   -test_ordinal_encoder_missing_appears_frequentr    s    

&B%2	%!	3wi	?6(	JK	 	 	   A...22155GXrv67vFFFHF''GGqcA3bfX677777r%   c            	         t          j        t           j        gdgdz  z   dgdz  z   dgz   dgz   dgdz  d	gdz  z   gt          
          j        } t          d                              |           }t          j        ddgdd	gt           j        d	gdd	gddggt          
          }|                    |          }t          |ddgddgt           j        dgddgddgg           dS )z7Check behavior when missing value appears infrequently.r   rc   r   r   rn  r  redrG  greenrH   r*   )r  r   r   r   Nr  r  s       r#   /test_ordinal_encoder_missing_appears_infrequentr    s    
 	VHw|#ugk1WI=HGaK7)a-'	
 	 	 	   1---11!44GXeWVWGEN	
 	 	 	F ''GGq!fq!frvqkAq6Aq6JKKKKKr%   c                     t          j        dgdgdggt                    } | g dg          }t          j        t
                    5  |                    |           ddd           dS # 1 swxY w Y   dS )a!  Check that we raise a `NotFittedError` by calling transform before fit with
    the encoders.

    One could expect that the passing the `categories` argument to the encoder
    would make it stateless. However, `fit` is making a couple of check, such as the
    position of `np.nan`.
    rU   rV   rg  rH   rf  r   N)r   r   r   r1   r2   r   r4   )r  r   r4  s      r#   test_encoder_not_fittedr  	  s     	3%#&f555Ag///!2333G	~	&	&  !                 s   
A--A14A1)r   numpyr   r1   scipyr   sklearn.exceptionsr   sklearn.preprocessingr   r   sklearn.utils._missingr   sklearn.utils._testingr   r	   r
   sklearn.utils.fixesr   r$   markparametrizer:   rD   r!  float32r.  rO   r]   r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r
  r_  str_r   rM   r   r  r  r  r  r  r  r*  r0  r2  r4  rB  rM  rI  rK  rN  rS  rd  rk  ro  rx  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r"  r&  r)  r+  r-  r0  r5  r8  r<  r@  rC  rH  rN  rL   rV  r^  rb  rl  ro  rs  rx  rz  r~  r  r  r  r  r  r  r  r   r%   r#   <module>r     s   				            - - - - - - ? ? ? ? ? ? ? ? 0 0 0 0 0 0         
 / . . . . .@ @ @. )H6K+LMM# # NM#. )H6K+LMM& & NM&$ "(BJ
)KLL2:rz(JKK
; 
; LK ML
; "(BJ
)KLLA A MLA92 92 92xJ J J$ $ $4	9 	9 	9
 
 
 	(+++zzz*++###%6%6%67vFFF///C#78GGG///Cuu#=>fMMM"""OOO4FCCC///C#67vFFF///Cut#<=VLLL			 		 		   .G G/ .G )H6K+LMMUDM22$11,$ ,$ 21 32 NM,$^ UDM22b'Ar7QG	$yyy)))YYY&GHS\E3<'3%F__ooo?	
	 	' '	 	 32'*7 7 7 !=!=!=>>'C'C'CDD
C 
C ED ?>
C E?#;<<1vxrxc
';';<==  >= =< E?#;<<  =< "+r{	#uenrd%;RZH	Aq6Aq6"	#	#q!fqc]BJ?BHsElS%L1@@@3Z%!J	

 
C<#u.	/	/3*ug1FP	Aq6BFA;'	(	(Arv;*<bjIBHsBFmdBF^4FCCC4[26(#J	
 BHsEE%LL)D%%,,+?@OOO4[55<<.)J	
*	 	 	/     B7 7C   B7" )H6K+LMM BHsCj\0002BHsCj\0002__J		
 BHq!fXW---/BHq!fXW---/YYKH		
 BHsCj\0002BHsCj\0002RXooo&&'J		
 BHtSk]&1113BHtSk]&1113		
 BHsCj\0002BHsBFm_F3335__		
 BHsDk]&1113BHsBFm_F3335		
?%L	 	 	Q  0 0bA Ac0 0 NMdA(  $ ]N$CDD
 
 ED

? 
? 
?7 7 7 	66"#	.../	()
 	&%%   6 6 6& & &0 	(+++{{{+,,###%6%6%67vFFF
 	'&&   2 2 2  BHsCj\0002BHsCj\0002__J		
 BHq!fXW---/BHq!fXW---/YYKH		
 BHsCj\0002BHsCj\0002RXooo&&'J		
( 	322-   0 1 0"$ $ $- - - 5#,//- - 0/-6 6 6	 	 	  8 8 868 8 8,8 8 8 264u*FGG!1 !1 HG!1H 5!*.A.A.A!BCCB B DCB T5M'7JKK'===!9?RSS. . TS LK.$ ]N$CDD= = ED= 	1	"	$q11r22	 	 1E1E1E0F'GHHI I IH	 	I6 +w!>??C C @?C. 3%#00  10" 	1	!	!	$	$q11q11 Q Q Q< 'C5!122/ / 32/. 3%#00  10  0 !a88?A:NO ; ; ;2, , ,6!, !, !,H; ; ;$X, X, X,v>, >, >,B bA$N$N#OPP
$ 
$ QP
$ a1$M$M#NOO) ) PO)  !#M#M#M  'E'E'EFF* * GF *6  * 264.99G G :9G, , ,. ),A8+LMM((;<<, , =< NM,< )H6K+LMM"B "B NM"BJ )H6K+LMM!E !E NM!EH )H6K+LMM) ) NM)4  (   0262,??" " @?"" ((;<<0262,??% % @? =<%>  3-77793*V44463RV,F;;;<
	 3-77793*V44463RV,F;;;<
	 3-
;;;=3%
33353RV,--.
	%4	 	 	9  ! !D E! !D$ ]N$CDD  ED ! BHsBFC()**,BHsBFC()**,BHseW	
 BHooo&'')BHooo&'')BHrvhZ  	
 BHsBFC()888:BHsBFC()**,BHseWF+++	
 BHooo&f5557BHooo&'')BHrvhZv...	
! 24 43 24 .992 2 :92,  " 
4+c***4+c***  
s3*S)))3*S))) 	( 	(   	(6 6 6(	1 	1 	1' ' '< u66  762 :
 BHsecU^6222S26(RVH%BJvv.f===	
 BHrvhu-V<<<S26(RVH%BJx"&2&AAA	
 &' '' &'BM M M.M M M / / /"	$ 	$ 	$!! !! !!H 	1	!	!	$	$q11q11 4 4 46!4 !4 !4H4 4 4*-- -- --`- - -* 	1	!   ( 	1	# 
K 
K 
K
8 
8 
8L L L8 ]N$CDD  ED  r%   