Source code for your_project.utils.html_parser

"""HTML parsing utility for extracting data from web responses.

**Example:**

.. code-block:: python

    # Make a request to target
    response = requests.get("http://target.com/login")

    # Parse the HTML response
    parser = HTMLParser.from_response(response)

    # Find CSRF token for form submission
    csrf_token = parser.find_csrf_token()
    if csrf_token:
        out.success(f"Found CSRF token: {csrf_token}")

"""

from bs4 import BeautifulSoup
from typing import List, Dict, Any, Optional, Union
from .output import out


[docs] class HTMLParser: def __init__(self, html: str): self.soup = BeautifulSoup(html, 'html.parser') self.html = html
[docs] @classmethod def from_response(cls, response): """Create parser from requests Response object. Args: response: A requests.Response object Returns: HTMLParser instance initialized with response text """ return cls(response.text)
[docs] @classmethod def from_file(cls, filepath: str): """Create parser from HTML file. Args: filepath: Path to HTML file Returns: HTMLParser instance initialized with file contents """ with open(filepath, 'r', encoding='utf-8') as f: return cls(f.read())
[docs] def find_by_id(self, element_id: str): """Find first element with given ID. Args: element_id: The ID attribute value to search for Returns: BeautifulSoup Tag object or None """ return self.soup.find(id=element_id)
[docs] def find_all_by_id(self, element_id: str): """Find all elements with given ID (invalid HTML but sometimes happens). Args: element_id: The ID attribute value to search for Returns: List of BeautifulSoup Tag objects """ return self.soup.find_all(id=element_id)
[docs] def find_by_class(self, class_name: str): """Find first element with given class name. Args: class_name: The CSS class to search for Returns: BeautifulSoup Tag object or None """ return self.soup.find(class_=class_name)
[docs] def find_all_by_class(self, class_name: str): """Find all elements with given class name. Args: class_name: The CSS class to search for Returns: List of BeautifulSoup Tag objects """ return self.soup.find_all(class_=class_name)
[docs] def find_by_tag(self, tag_name: str): return self.soup.find(tag_name)
[docs] def find_all_by_tag(self, tag_name: str): return self.soup.find_all(tag_name)
[docs] def find_by_name(self, name: str): return self.soup.find(attrs={'name': name})
[docs] def find_all_by_name(self, name: str): return self.soup.find_all(attrs={'name': name})
[docs] def find_by_attr(self, attr_name: str, attr_value: str): return self.soup.find(attrs={attr_name: attr_value})
[docs] def find_all_by_attr(self, attr_name: str, attr_value: str): return self.soup.find_all(attrs={attr_name: attr_value})
[docs] def find_forms(self) -> List: """Find all form elements in the HTML. Returns: List of form Tag objects """ return self.soup.find_all('form')
[docs] def find_inputs(self, form=None) -> List: """Find all input elements, optionally within a specific form. Args: form: Optional form element to search within Returns: List of input Tag objects """ search_in = form if form else self.soup return search_in.find_all('input')
[docs] def find_scripts(self) -> List: return self.soup.find_all('script')
[docs] def find_meta(self) -> List: return self.soup.find_all('meta')
[docs] def extract_text(self, element=None) -> str: target = element if element else self.soup return target.get_text(strip=True)
[docs] def extract_attrs(self, element) -> Dict: if hasattr(element, 'attrs'): return dict(element.attrs) return {}
[docs] def extract_form_data(self, form) -> Dict[str, Any]: """Extract all input data from a form element. Extracts names and values from input, textarea, and select elements, handling checkboxes, radio buttons, and default values properly. Args: form: BeautifulSoup form Tag object Returns: Dict mapping input names to their values Example: .. code-block:: python form = parser.find_forms()[0] data = parser.extract_form_data(form) data['username'] = 'admin' # Update with your values requests.post(url, data=data) """ data = {} inputs = form.find_all(['input', 'textarea', 'select']) for inp in inputs: name = inp.get('name') if not name: continue if inp.name == 'input': if inp.get('type') == 'checkbox': if inp.get('checked'): data[name] = inp.get('value', 'on') elif inp.get('type') == 'radio': if inp.get('checked'): data[name] = inp.get('value') else: data[name] = inp.get('value', '') elif inp.name == 'textarea': data[name] = inp.text elif inp.name == 'select': option = inp.find('option', selected=True) if option: data[name] = option.get('value', option.text) else: first_option = inp.find('option') if first_option: data[name] = first_option.get('value', first_option.text) return data
[docs] def search(self, text: str, tag: Optional[str] = None): import re if tag: return self.soup.find_all(tag, string=re.compile(text, re.I)) else: return self.soup.find_all(string=re.compile(text, re.I))
[docs] def css_select(self, selector: str): """Select elements using CSS selector syntax. Args: selector: CSS selector string (e.g., 'div.class', '#id', 'form input[type="hidden"]') Returns: List of matching Tag objects Example: .. code-block:: python # Find all hidden inputs hidden = parser.css_select('input[type="hidden"]') # Find all links in navigation nav_links = parser.css_select('nav a') """ return self.soup.select(selector)
[docs] def css_select_one(self, selector: str): """Select first element matching CSS selector. Args: selector: CSS selector string Returns: First matching Tag object or None """ return self.soup.select_one(selector)
[docs] def get_title(self) -> Optional[str]: title = self.soup.find('title') return title.text if title else None
[docs] def get_headers(self) -> Dict[str, List[str]]: headers = {} for i in range(1, 7): h_tags = self.soup.find_all(f'h{i}') if h_tags: headers[f'h{i}'] = [h.get_text(strip=True) for h in h_tags] return headers
[docs] def find_csrf_token(self) -> Optional[str]: """Find CSRF token in the HTML (checks common locations and names). Searches for CSRF tokens in: - Meta tags with common CSRF names - Input fields with common CSRF names - Hidden input fields containing 'csrf' or 'token' Returns: CSRF token value if found, None otherwise Example: .. code-block:: python parser = HTMLParser.from_response(response) csrf = parser.find_csrf_token() if csrf: form_data = {'csrf_token': csrf, 'username': 'admin'} requests.post(url, data=form_data) """ common_names = [ 'csrf_token', 'csrftoken', 'csrf', '_csrf', 'authenticity_token', 'csrfmiddlewaretoken', '__RequestVerificationToken', 'token', '_token', 'csrf-token', 'CSRF-TOKEN', 'X-CSRF-Token' ] # Check meta tags for name in common_names: meta = self.soup.find('meta', attrs={'name': name}) if meta and meta.get('content'): return meta.get('content') # Check input fields for name in common_names: input_field = self.soup.find('input', attrs={'name': name}) if input_field and input_field.get('value'): return input_field.get('value') # Check hidden inputs (broader search) hidden_inputs = self.soup.find_all('input', attrs={'type': 'hidden'}) for inp in hidden_inputs: name = inp.get('name', '').lower() if 'csrf' in name or 'token' in name: return inp.get('value') return None
[docs] def find_all_csrf_tokens(self) -> Dict[str, str]: tokens = {} # Meta tags meta_tags = self.soup.find_all('meta') for meta in meta_tags: name = meta.get('name', '').lower() if 'csrf' in name or 'token' in name: content = meta.get('content') if content: tokens[f"meta[{meta.get('name')}]"] = content # Input fields inputs = self.soup.find_all('input') for inp in inputs: name = inp.get('name', '').lower() if 'csrf' in name or 'token' in name: value = inp.get('value') if value: tokens[f"input[{inp.get('name')}]"] = value return tokens
[docs] def dump_forms(self): """Print all forms with their inputs and values (for debugging). Useful for quick reconnaissance of form structures and hidden fields. Example: .. code-block:: python # Quick form analysis parser = HTMLParser.from_response(response) parser.dump_forms() # Output: # Form 1: # Action: /login # Method: POST # username: # password: # csrf_token: abc123... """ forms = self.find_forms() for i, form in enumerate(forms): out.info(f"Form {i+1}:") print(f" Action: {form.get('action', 'N/A')}") print(f" Method: {form.get('method', 'GET')}") data = self.extract_form_data(form) for name, value in data.items(): print(f" {name}: {value}") print()
[docs] def quick_parse(html: str) -> HTMLParser: """Quick helper to create parser from HTML string. Args: html: HTML content as string Returns: HTMLParser instance """ return HTMLParser(html)
[docs] def parse_response(response) -> HTMLParser: """Quick helper to create parser from requests Response. Args: response: requests.Response object Returns: HTMLParser instance Example: .. code-block:: python resp = requests.get("http://target.com") parser = parse_response(resp) csrf = parser.find_csrf_token() """ return HTMLParser.from_response(response)
[docs] def parse_file(filepath: str) -> HTMLParser: """Quick helper to create parser from HTML file. Args: filepath: Path to HTML file Returns: HTMLParser instance """ return HTMLParser.from_file(filepath)