
    -jM                    D   S r SSKJr  SrS/rSSKJr  SSKrSSKJ	r	J
r
JrJrJrJrJrJrJrJrJr  SSKJrJrJrJrJrJr  SS	KJrJr  SS
KJrJ r J!r!J"r"  SSK#J$r$  \(       a  SSK%J&r&  SSKJ'r'  SSK(J)r)J*r*J+r+  Sr,\
\\-\-4   \-\-/S4   r. " S S\\5      r/ " S S\!5      r0g)zCUse the HTMLParser library to parse HTML files that aren't too bad.    )annotationsMITHTMLParserTreeBuilder)
HTMLParserN)AnyCallablecastDictIterableListOptionalTYPE_CHECKINGTupleTypeUnion)AttributeDictCDataCommentDeclarationDoctypeProcessingInstruction)EntitySubstitutionUnicodeDammit)DetectsXMLParsedAsHTMLHTMLHTMLTreeBuilderSTRICTParserRejectedMarkup)BeautifulSoup)NavigableString)	_Encoding
_Encodings
_RawMarkupzhtml.parserc                  `   \ rS rSr% SrS\S'   SrS\S'    \S.       SS jjrS\S
'   S\S'   S	\S'   S S jr      S!S jr	 S"       S#S jjr
S"S$S jjrS%S jr\R                  " S5      r\R                  " S5      r\S&S j5       rS'S jrS'S jrS%S jrS(S jrS%S jrS%S jrSrg))BeautifulSoupHTMLParser>   replacestrREPLACEignoreIGNOREon_duplicate_attributesoupr    r.   &Union[str, _DuplicateAttributeHandler]c                   Xl         X l        UR                  R                  U l        [        R
                  " U /UQ70 UD6  / U l        U R                  5         g N)r/   r.   builderattribute_dict_classr   __init__already_closed_empty_element_initialize_xml_detector)selfr/   r.   argskwargss        O/mnt/data/Frawley/.venv/lib/python3.13/site-packages/bs4/builder/_htmlparser.pyr5    BeautifulSoupHTMLParser.__init__U   sO     	&<#$(LL$E$E!D24262 -/)%%'    z	List[str]r6   c                    [        U5      er2   r   )r8   messages     r;   errorBeautifulSoupHTMLParser.errorp   s     #7++r=   c                D    U R                  XSS9  U R                  USS9  g)zmHandle an incoming empty-element tag.

html.parser only calls this method when the markup looks like
<tag/>.
F)handle_empty_elementcheck_already_closedN)handle_starttaghandle_endtag)r8   tagattrss      r;   handle_startendtag*BeautifulSoupHTMLParser.handle_startendtag   s,     	SeD 	3U;r=   c           	     p   U R                  5       nU Hc  u  pVUc  SnXT;   aP  U R                  nXpR                  :X  a  M,  USU R                  4;   a  XdU'   MD  [	        [
        U5      nU" XEU5        M_  XdU'   Me     U R                  R                  R                  (       a  U R                  5       u  pOS=pU R                  R                  USSXHU	S9n
U
bC  U
R                  (       a2  U(       a+  U R                  USS9  U R                  R                  U5        U R                  c  U R!                  U5        gg)zHandle an opening tag, e.g. '<tag>'

:param handle_empty_element: True if this tag is known to be
    an empty-element tag (i.e. there is not expected to be any
    closing tag).
N )
sourceline	sourceposFrD   )r4   r.   r,   r*   r	   _DuplicateAttributeHandlerr/   r3   store_line_numbersgetposrF   is_empty_elementrG   r6   append_root_tag_name_root_tag_encountered)r8   rH   rI   rC   	attr_dictkeyvalueon_duperN   rO   tagObjs              r;   rF   'BeautifulSoupHTMLParser.handle_starttag   s.    $(#<#<#>	JC } 55kk)t|| 44%*cN"#=wGGIE2!&#%  , 99//$(KKM!J	%))J**tY + 
 &"9"9>R s? --44S9&&&s+ 'r=   c                    U(       a+  XR                   ;   a  U R                   R                  U5        gU R                  R                  U5        g)zHandle a closing tag, e.g. '</tag>'

:param tag: A tag name.
:param check_already_closed: True if this tag is expected to
   be the closing portion of an empty-element tag,
   e.g. '<tag></tag>'.
N)r6   remover/   rG   )r8   rH   rE   s      r;   rG   %BeautifulSoupHTMLParser.handle_endtag   s:      C+L+L$L
 --44S9II##C(r=   c                :    U R                   R                  U5        g)z4Handle some textual data that shows up between tags.N)r/   handle_datar8   datas     r;   ra   #BeautifulSoupHTMLParser.handle_data   s    		d#r=   z^([0-9]+)(.*)z^([0-9a-f]+)(.*)c                   SnSnSnSnU R                   nUR                  S5      (       d  UR                  S5      (       a  USS nSnU R                  nSn [        X5      nUc  SnUnO[        R                  " U5      u  p#X#U4$ ! [         aG    UR                  U5      nUb0  [        UR                  5       S	   U5      nUR                  5       S   n Ntf = f)
aX  Convert a numeric character reference into an actual character.

:param name: The number of the character reference, as
  obtained by html.parser

:return: A 3-tuple (dereferenced, replacement_added,
  extra_data). `dereferenced` is the dereferenced character
  reference, or the empty string if there was no
  reference. `replacement_added` is True if the reference
  could only be dereferenced by replacing content with U+FFFD
  REPLACEMENT CHARACTER. `extra_data` is a portion of data
  following the character reference, which was deemed to be
  normal data and not part of the reference at all.
rM   F
   xX   N   r   )	&_DECIMAL_REFERENCE_WITH_FOLLOWING_DATA
startswith"_HEX_REFERENCE_WITH_FOLLOWING_DATAint
ValueErrorsearchgroupsr   numeric_character_reference)	clsnamedereferencedreplacement_added
extra_databasereg	real_namematchs	            r;   (_dereference_numeric_character_reference@BeautifulSoupHTMLParser._dereference_numeric_character_reference   s      !&
88??34??3#7#78DD88C"&		/DI$ LJ.;.W.WXa.b+L
::-  	/ JJt$E q 148	"\\^A.
	/s   B ACCc                    U R                  U5      u  p#nU(       a  SU R                  l        Ub  U R                  U5        Ub  U R                  U5        gg)zHandle a numeric character reference by converting it to the
corresponding Unicode character and treating it as textual
data.

:param name: Character number, possibly in hexadecimal.
TN)r|   r/   contains_replacement_charactersra   )r8   rt   ru   rv   rw   s        r;   handle_charref&BeautifulSoupHTMLParser.handle_charref"  sV     7;6c6cdh6i38<DII5#\*!Z( "r=   c                z    [         R                  R                  U5      nUb  UnOSU-  nU R                  U5        g)zHandle a named entity reference by converting it to the
corresponding Unicode character(s) and treating it as textual
data.

:param name: Name of the entity reference.
Nz&%s)r   HTML_ENTITY_TO_CHARACTERgetra   )r8   rt   	characterrc   s       r;   handle_entityref(BeautifulSoupHTMLParser.handle_entityref1  s>     '??CCDI	 D 4<Dr=   c                    U R                   R                  5         U R                   R                  U5        U R                   R                  [        5        g)z?Handle an HTML comment.

:param data: The text of the comment.
N)r/   endDatara   r   rb   s     r;   handle_comment&BeautifulSoupHTMLParser.handle_commentD  s8    
 					d#		'"r=   c                    U R                   R                  5         U[        S5      S nU R                   R                  U5        U R                   R                  [        5        g)zIHandle a DOCTYPE declaration.

:param data: The text of the declaration.
zDOCTYPE N)r/   r   lenra   r   )r8   decls     r;   handle_decl#BeautifulSoupHTMLParser.handle_declM  sI    
 			C
O%&		d#		'"r=   c                "   UR                  5       R                  S5      (       a  [        nU[        S5      S nO[        nU R
                  R                  5         U R
                  R                  U5        U R
                  R                  U5        g)zkHandle a declaration of unknown type -- probably a CDATA block.

:param data: The text of the declaration.
zCDATA[N)upperrl   r   r   r   r/   r   ra   )r8   rc   rs   s      r;   unknown_decl$BeautifulSoupHTMLParser.unknown_declW  si     ::<""8,,CH(DC				d#		#r=   c                    U R                   R                  5         U R                   R                  U5        U R                  U5        U R                   R                  [        5        g)zLHandle a processing instruction.

:param data: The text of the instruction.
N)r/   r   ra   _document_might_be_xmlr   rb   s     r;   	handle_pi!BeautifulSoupHTMLParser.handle_pif  sG    
 					d###D)		/0r=   )r6   r4   r.   r/   N)r/   r    r9   r   r.   r0   r:   r   )r?   r)   returnNone)rH   r)   rI   List[Tuple[str, Optional[str]]]r   r   )T)rH   r)   rI   r   rC   boolr   r   )rH   r)   rE   r   r   r   )rc   r)   r   r   )rt   r)   r   zTuple[str, bool, str])rt   r)   r   r   )r   r)   r   r   )__name__
__module____qualname____firstlineno__r*   __annotations__r,   r5   r@   rJ   rF   rG   ra   recompilerk   rm   classmethodr|   r   r   r   r   r   r   __static_attributes__ r=   r;   r&   r&   >   s    GS FC$ JQ	(( ( !G	(
 (. CB"++
, <<><	<0 &*	<,<, /<, #	<,
 
<,|)$$ .0ZZ-H*)+4F)G&4; 4;l)&##1r=   r&   c                     ^  \ rS rSr% SrSrS\S'   SrS\S'   \r	S\S	'   \	\
\/rS
\S'   S\S'   SrS\S'     S     SU 4S jjjr   S         SS jjr\4SS jjrSrU =r$ )r   iq  zA Beautiful soup `bs4.builder.TreeBuilder` that uses the
:py:class:`html.parser.HTMLParser` parser, found in the Python
standard library.

Fr   is_xmlT	picklabler)   NAMEzIterable[str]featuresz$Tuple[Iterable[Any], Dict[str, Any]]parser_argsTRACKS_LINE_NUMBERSc                   > [        5       nS H  nXS;   d  M
  UR                  U5      nXdU'   M!     [        [        U ]  " S0 UD6  U=(       d    / nU=(       d    0 nUR                  U5        SUS'   X4U l        g)aB  Constructor.

:param parser_args: Positional arguments to pass into
    the BeautifulSoupHTMLParser constructor, once it's
    invoked.
:param parser_kwargs: Keyword arguments to pass into
    the BeautifulSoupHTMLParser constructor, once it's
    invoked.
:param kwargs: Keyword arguments for the superclass constructor.
r-   Fconvert_charrefsNr   )dictpopsuperr   r5   updater   )r8   r   parser_kwargsr:   extra_parser_kwargsargrY   	__class__s          r;   r5   HTMLParserTreeBuilder.__init__  s    $ #f.C}

3+0C( / 	#T3=f=!'R%+01,1()'7r=   c              #  Z  #    [        U[        5      (       a	  USSS4v   g/ nU(       a  UR                  U5        / nU(       a  UR                  U5        [        UUUSUS9nUR                  c  [        S5      eUR                  UR                  UR                  UR                  4v   g7f)a  Run any preliminary steps necessary to make incoming markup
acceptable to the parser.

:param markup: Some markup -- probably a bytestring.
:param user_specified_encoding: The user asked to try this encoding.
:param document_declared_encoding: The markup itself claims to be
    in this encoding.
:param exclude_encodings: The user asked _not_ to try any of
    these encodings.

:yield: A series of 4-tuples: (markup, encoding, declared encoding,
     has undergone character replacement)

    Each 4-tuple represents a strategy for parsing the document.
    This TreeBuilder uses Unicode, Dammit to convert the markup
    into Unicode, so the ``markup`` element of the tuple will
    always be a string.
NFT)known_definite_encodingsuser_encodingsis_htmlexclude_encodingszPCould not convert input to Unicode, and html.parser will not accept bytestrings.)	
isinstancer)   rT   r   unicode_markupr   original_encodingdeclared_html_encodingr   )r8   markupuser_specified_encodingdocument_declared_encodingr   r   r   dammits           r;   prepare_markup$HTMLParserTreeBuilder.prepare_markup  s     2 fc""4u-- 57 "
 %++,CD*,% !!"<=%=)/
   ( 'b 
 %%((--66	 s   B)B+c                $   U R                   u  p4[        U[        5      (       d   eU R                  c   eU" U R                  /UQ70 UD6n UR	                  U5        UR                  5         / Ul        g! [         a  n[        U5      eSnAff = f)z
:param markup: The markup to feed into the parser.
:param _parser_class: An HTMLParser subclass to use. This is only intended for use in unit tests.
N)	r   r   r)   r/   feedcloseAssertionErrorr   r6   )r8   r   _parser_classr9   r:   parseres          r;   r   HTMLParserTreeBuilder.feed  s    
 '' &#&&&&
 yy$$$tyy:4:6:	*KKLLN /1+  	* 'q))		*s   !A5 5
B?B

B)r   )NN)r   zOptional[Iterable[Any]]r   zOptional[Dict[str, Any]]r:   r   )NNN)
r   r$   r   Optional[_Encoding]r   r   r   zOptional[_Encodings]r   zDIterable[Tuple[str, Optional[_Encoding], Optional[_Encoding], bool]])r   r$   r   ztype[BeautifulSoupHTMLParser]r   r   )r   r   r   r   __doc__r   r   r   
HTMLPARSERr   r   r   r   r   r5   r   r&   r   r   __classcell__)r   s   @r;   r   r   q  s     FDItD##T62Hm255 !%$ 04268,8 08 	8 8B 8<:>26FF "5F %8	F
 0F 
NFP Ul 1 1r=   )1r   
__future__r   __license____all__html.parserr   r   typingr   r   r	   r
   r   r   r   r   r   r   r   bs4.elementr   r   r   r   r   r   
bs4.dammitr   r   bs4.builderr   r   r   r   bs4.exceptionsr   bs4r    r!   bs4._typingr"   r#   r$   r   r)   rP   r&   r   r   r=   r;   <module>r      s    I "   # 	     9  0!+  
%tCH~sC&@$&FG p1j*@ p1f	T1O T1r=   