"""HTML parsing utility for extracting data from web responses.
**Example:**
.. code-block:: python
# Make a request to target
response = requests.get("http://target.com/login")
# Parse the HTML response
parser = HTMLParser.from_response(response)
# Find CSRF token for form submission
csrf_token = parser.find_csrf_token()
if csrf_token:
out.success(f"Found CSRF token: {csrf_token}")
"""
from bs4 import BeautifulSoup
from typing import List, Dict, Any, Optional, Union
from .output import out
class HTMLParser:
    """Convenience wrapper around a BeautifulSoup document.

    Exposes small lookup helpers (by id, class, tag, name, attribute, CSS
    selector) plus CSRF-token discovery and link dumping.
    """

    # Meta/input names checked, in this order, by ``find_csrf_token``.
    _CSRF_NAMES = (
        'csrf_token', 'csrftoken', 'csrf', '_csrf', 'authenticity_token',
        'csrfmiddlewaretoken', '__RequestVerificationToken', 'token',
        '_token', 'csrf-token', 'CSRF-TOKEN', 'X-CSRF-Token',
    )

    def __init__(self, html: str):
        """Parse ``html`` with BeautifulSoup's built-in ``html.parser``.

        Args:
            html: HTML content as a string
        """
        # Parsed document used by every lookup method.
        self.soup = BeautifulSoup(html, 'html.parser')
        # Raw markup, kept as passed in.
        self.html = html

    @classmethod
    def from_response(cls, response):
        """Create parser from requests Response object.

        Args:
            response: A requests.Response object (only ``.text`` is read)

        Returns:
            HTMLParser instance initialized with response text
        """
        return cls(response.text)

    @classmethod
    def from_file(cls, filepath: str):
        """Create parser from HTML file.

        Args:
            filepath: Path to HTML file (read as UTF-8)

        Returns:
            HTMLParser instance initialized with file contents
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            return cls(f.read())

    def find_by_id(self, element_id: str):
        """Find first element with given ID.

        Args:
            element_id: The ID attribute value to search for

        Returns:
            BeautifulSoup Tag object or None
        """
        return self.soup.find(id=element_id)

    def find_all_by_id(self, element_id: str):
        """Find all elements with given ID (invalid HTML but sometimes happens).

        Args:
            element_id: The ID attribute value to search for

        Returns:
            List of BeautifulSoup Tag objects
        """
        return self.soup.find_all(id=element_id)

    def find_by_class(self, class_name: str):
        """Find first element with given class name.

        Args:
            class_name: The CSS class to search for

        Returns:
            BeautifulSoup Tag object or None
        """
        return self.soup.find(class_=class_name)

    def find_all_by_class(self, class_name: str):
        """Find all elements with given class name.

        Args:
            class_name: The CSS class to search for

        Returns:
            List of BeautifulSoup Tag objects
        """
        return self.soup.find_all(class_=class_name)

    def find_by_tag(self, tag_name: str):
        """Find first element with given tag name.

        Args:
            tag_name: Tag name to search for (e.g., 'form')

        Returns:
            BeautifulSoup Tag object or None
        """
        return self.soup.find(tag_name)

    def find_all_by_tag(self, tag_name: str):
        """Find all elements with given tag name.

        Args:
            tag_name: Tag name to search for (e.g., 'input')

        Returns:
            List of BeautifulSoup Tag objects
        """
        return self.soup.find_all(tag_name)

    def find_by_name(self, name: str):
        """Find first element whose ``name`` attribute equals ``name``.

        Args:
            name: The name attribute value to search for

        Returns:
            BeautifulSoup Tag object or None
        """
        return self.soup.find(attrs={'name': name})

    def find_all_by_name(self, name: str):
        """Find all elements whose ``name`` attribute equals ``name``.

        Args:
            name: The name attribute value to search for

        Returns:
            List of BeautifulSoup Tag objects
        """
        return self.soup.find_all(attrs={'name': name})

    def find_by_attr(self, attr_name: str, attr_value: str):
        """Find first element with the given attribute/value pair.

        Args:
            attr_name: Attribute name (e.g., 'data-role')
            attr_value: Attribute value to match

        Returns:
            BeautifulSoup Tag object or None
        """
        return self.soup.find(attrs={attr_name: attr_value})

    def find_all_by_attr(self, attr_name: str, attr_value: str):
        """Find all elements with the given attribute/value pair.

        Args:
            attr_name: Attribute name (e.g., 'data-role')
            attr_value: Attribute value to match

        Returns:
            List of BeautifulSoup Tag objects
        """
        return self.soup.find_all(attrs={attr_name: attr_value})

    def find_links(self) -> List:
        """Find all links (anchor tags with href).

        Returns:
            List of anchor Tag objects with href attributes
        """
        return self.soup.find_all('a', href=True)

    def find_scripts(self) -> List:
        """Find all ``<script>`` elements.

        Returns:
            List of script Tag objects
        """
        return self.soup.find_all('script')

    def search(self, text: str, tag: Optional[str] = None):
        """Search the document's strings for a pattern, ignoring case.

        ``text`` is compiled as a regular expression, so regex
        metacharacters are interpreted; pass ``re.escape(text)`` for a
        literal match.

        Args:
            text: Regular expression to look for
            tag: Optional tag name to restrict the search to

        Returns:
            Result of ``find_all`` with a ``string=`` filter, limited to
            ``tag`` when one is given

        Raises:
            re.error: If ``text`` is not a valid regular expression
        """
        import re

        pattern = re.compile(text, re.I)
        if tag:
            return self.soup.find_all(tag, string=pattern)
        return self.soup.find_all(string=pattern)

    def css_select(self, selector: str):
        """Select elements using CSS selector syntax.

        Args:
            selector: CSS selector string (e.g., 'div.class', '#id',
                'form input[type="hidden"]')

        Returns:
            List of matching Tag objects

        Example:
            .. code-block:: python

                # Find all hidden inputs
                hidden = parser.css_select('input[type="hidden"]')

                # Find all links in navigation
                nav_links = parser.css_select('nav a')
        """
        return self.soup.select(selector)

    def css_select_one(self, selector: str):
        """Select first element matching CSS selector.

        Args:
            selector: CSS selector string

        Returns:
            First matching Tag object or None
        """
        return self.soup.select_one(selector)

    def get_title(self) -> Optional[str]:
        """Return the text of the first ``<title>`` element.

        Returns:
            Title text, or None when the document has no title element
        """
        title = self.soup.find('title')
        return title.text if title else None

    @staticmethod
    def _is_csrf_like(name: str) -> bool:
        """Return True when ``name`` contains 'csrf' or 'token' (any case)."""
        lowered = name.lower()
        return 'csrf' in lowered or 'token' in lowered

    def find_csrf_token(self) -> Optional[str]:
        """Find CSRF token in the HTML (checks common locations and names).

        Searches for CSRF tokens in:

        - Meta tags with common CSRF names
        - Input fields with common CSRF names
        - Hidden input fields containing 'csrf' or 'token'

        Candidates with a missing or empty value are skipped at every
        stage, so a valueless field never hides a later real token.

        Returns:
            CSRF token value if found, None otherwise

        Example:
            .. code-block:: python

                parser = HTMLParser.from_response(response)
                csrf = parser.find_csrf_token()
                if csrf:
                    form_data = {'csrf_token': csrf, 'username': 'admin'}
                    requests.post(url, data=form_data)
        """
        # Check meta tags
        for name in self._CSRF_NAMES:
            meta = self.soup.find('meta', attrs={'name': name})
            content = meta.get('content') if meta else None
            if content:
                return content

        # Check input fields
        for name in self._CSRF_NAMES:
            field = self.soup.find('input', attrs={'name': name})
            value = field.get('value') if field else None
            if value:
                return value

        # Check hidden inputs (broader search). Keep scanning past matches
        # that carry no value instead of returning None/'' early.
        for field in self.soup.find_all('input', attrs={'type': 'hidden'}):
            value = field.get('value')
            if value and self._is_csrf_like(field.get('name', '')):
                return value

        return None

    def find_all_csrf_tokens(self) -> Dict[str, str]:
        """Collect every CSRF-looking token in the document.

        A meta or input element qualifies when its ``name`` contains
        'csrf' or 'token' (case-insensitive) and it has a non-empty
        ``content`` (meta) or ``value`` (input).

        Returns:
            Dict mapping ``"meta[<name>]"`` / ``"input[<name>]"`` to the
            token value; a later element with the same key overwrites an
            earlier one
        """
        tokens: Dict[str, str] = {}

        # Meta tags
        for meta in self.soup.find_all('meta'):
            name = meta.get('name', '')
            content = meta.get('content')
            if content and self._is_csrf_like(name):
                tokens[f"meta[{name}]"] = content

        # Input fields
        for field in self.soup.find_all('input'):
            name = field.get('name', '')
            value = field.get('value')
            if value and self._is_csrf_like(name):
                tokens[f"input[{name}]"] = value

        return tokens

    def dump_links(self):
        """Print all links found in the HTML (for crawling/mapping).

        Each link is printed as ``"<text>: <href>"``, or just the href
        when the anchor has no text.

        Example:
            .. code-block:: python

                parser.dump_links()
                # Output:
                # Home: /
                # Admin Panel: /admin
                # Login: /login
        """
        for link in self.find_links():
            text = link.get_text(strip=True)
            href = link.get('href')
            if text:
                print(f"{text}: {href}")
            else:
                print(href)
def quick_parse(html: str) -> HTMLParser:
    """Quick helper to create parser from HTML string.

    Args:
        html: HTML content as string

    Returns:
        HTMLParser instance
    """
    return HTMLParser(html)
def parse_response(response) -> HTMLParser:
    """Quick helper to create parser from requests Response.

    Delegates to ``HTMLParser.from_response``.

    Args:
        response: requests.Response object

    Returns:
        HTMLParser instance

    Example:
        .. code-block:: python

            resp = requests.get("http://target.com")
            parser = parse_response(resp)
            csrf = parser.find_csrf_token()
    """
    return HTMLParser.from_response(response)
def parse_file(filepath: str) -> HTMLParser:
    """Quick helper to create parser from HTML file.

    Delegates to ``HTMLParser.from_file``.

    Args:
        filepath: Path to HTML file

    Returns:
        HTMLParser instance
    """
    return HTMLParser.from_file(filepath)