import os
import uuid
from changedetectionio import strtobool
from .persistence import EntityPersistenceMixin, _determine_entity_type
__all__ = ['EntityPersistenceMixin', 'watch_base']
USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH = 'System default'
CONDITIONS_MATCH_LOGIC_DEFAULT = 'ALL'
class watch_base(dict):
"""
Base watch domain model (inherits from dict for backward compatibility).
WARNING: This class inherits from dict, which violates proper encapsulation.
Dict inheritance is legacy technical debt that should be refactored to a proper
domain model (e.g., Pydantic BaseModel) for better type safety and validation.
TODO: Migrate to Pydantic BaseModel for:
- Type safety and IDE autocomplete
- Automatic validation
- Clear separation between domain model and serialization
- Database backend abstraction (file → postgres → mongodb)
- Configuration override chain resolution (Watch → Tag → Global)
- Immutability options
- Better testing
    - Use https://docs.pydantic.dev/latest/integrations/datamodel_code_generator to build the model from the API spec
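    Illustrative CLI sketch (file names are hypothetical; check the
    datamodel-code-generator docs for the exact flags):

        datamodel-codegen --input api-spec.yaml --input-file-type openapi --output watch_model.py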
CHAIN RESOLUTION ARCHITECTURE:
The dream is a 3-level override hierarchy:
Watch settings → Tag/Group settings → Global settings
Current implementation: MANUAL resolution scattered across codebase
- Processors manually check watch.get('field')
- Loop through tags to find overrides_watch=True
- Fall back to datastore['settings']['application']['field']
Pydantic implementation: AUTOMATIC resolution via @computed_field
- Single source of truth for each setting's resolution logic
- Type-safe, testable, self-documenting
- Example: watch.resolved_fetch_backend (instead of nested dict navigation)
See: Watch.py model docstring for detailed Pydantic architecture plan
See: Tag.py model docstring for tag override explanation
See: processors/restock_diff/processor.py:184-192 for current manual example
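    Illustrative Pydantic sketch of the resolution chain (hypothetical;
    resolved_tags and global_settings are assumed helpers, not existing API):

        class WatchModel(BaseModel):
            fetch_backend: str | None = None

            @computed_field
            @property
            def resolved_fetch_backend(self) -> str:
                # Watch value wins, then the first tag with overrides_watch=True,
                # then the global application setting
                if self.fetch_backend and self.fetch_backend != 'system':
                    return self.fetch_backend
                for tag in self.resolved_tags:
                    if tag.overrides_watch and tag.fetch_backend:
                        return tag.fetch_backend
                return self.global_settings['application']['fetch_backend']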
Core Fields:
uuid (str): Unique identifier for this watch (auto-generated)
url (str): Target URL to monitor for changes
title (str|None): Custom display name (overrides page_title if set)
page_title (str|None): Title extracted from <title> tag of monitored page
tags (List[str]): List of tag UUIDs for categorization
tag (str): DEPRECATED - Old single-tag system, use tags instead
Check Configuration:
processor (str): Processor type ('text_json_diff', 'restock_diff', etc.)
fetch_backend (str): Fetcher to use ('system', 'html_requests', 'playwright', etc.)
method (str): HTTP method ('GET', 'POST', etc.)
headers (dict): Custom HTTP headers to send
proxy (str|None): Preferred proxy server
paused (bool): Whether change detection is paused
Scheduling:
time_between_check (dict): Check interval {'weeks': int, 'days': int, 'hours': int, 'minutes': int, 'seconds': int}
time_between_check_use_default (bool): Use global default interval if True
time_schedule_limit (dict): Weekly schedule limiting when checks can run
Structure: {
'enabled': bool,
'monday/tuesday/.../sunday': {
'enabled': bool,
'start_time': str ('HH:MM'),
'duration': {'hours': str, 'minutes': str}
}
}
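    Example (illustrative values):
        time_between_check = {'weeks': None, 'days': 0, 'hours': 3, 'minutes': 0, 'seconds': 0}
        # With time_between_check_use_default=True these values are ignored in
        # favor of the global default interval.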
Content Filtering:
include_filters (List[str]): CSS/XPath selectors to extract content
subtractive_selectors (List[str]): Selectors to remove from content
ignore_text (List[str]): Text patterns to ignore in change detection
trigger_text (List[str]): Text/regex that must be present to trigger change
text_should_not_be_present (List[str]): Text that should NOT be present
extract_text (List[str]): Regex patterns to extract specific text after filtering
Text Processing:
trim_text_whitespace (bool): Strip leading/trailing whitespace
sort_text_alphabetically (bool): Sort lines alphabetically before comparison
remove_duplicate_lines (bool): Remove duplicate lines
check_unique_lines (bool): Compare against all history for unique lines
strip_ignored_lines (bool|None): Remove lines matching ignore patterns
Change Detection Filters:
filter_text_added (bool): Include added text in change detection
filter_text_removed (bool): Include removed text in change detection
filter_text_replaced (bool): Include replaced text in change detection
Browser Automation:
browser_steps (List[dict]): Browser automation steps for JS-heavy sites
browser_steps_last_error_step (int|None): Last step that caused error
webdriver_delay (int|None): Seconds to wait after page load
webdriver_js_execute_code (str|None): JavaScript to execute before extraction
Restock Detection:
in_stock_only (bool): Only trigger on in-stock transitions
follow_price_changes (bool): Monitor price changes
has_ldjson_price_data (bool|None): Whether page has LD-JSON price data
track_ldjson_price_data (str|None): Track LD-JSON price data ('ACCEPT', 'REJECT', None)
price_change_threshold_percent (float|None): Minimum price change % to trigger
Notifications:
notification_urls (List[str]): Apprise URLs for notifications
notification_title (str|None): Custom notification title template
notification_body (str|None): Custom notification body template
notification_format (str): Notification format (e.g., 'System default', 'Text', 'HTML')
notification_muted (bool): Disable notifications for this watch
notification_screenshot (bool): Include screenshot in notifications
notification_alert_count (int): Number of notifications sent
last_notification_error (str|None): Last notification error message
body (str|None): DEPRECATED? Legacy notification body field
filter_failure_notification_send (bool): Send notification on filter failures
History & State:
date_created (int|None): Unix timestamp of watch creation
last_checked (int): Unix timestamp of last check
last_viewed (int): History snapshot key of last user view
last_error (str|bool): Last error message or False if no error
check_count (int): Total number of checks performed
fetch_time (float): Duration of last fetch in seconds
consecutive_filter_failures (int): Counter for consecutive filter match failures
previous_md5 (str|bool): MD5 hash of previous content
previous_md5_before_filters (str|bool): MD5 hash before filters applied
history_snapshot_max_length (int|None): Max history snapshots to keep (None = use global)
Conditions:
conditions (dict): Custom conditions for change detection logic
conditions_match_logic (str): Logic operator ('ALL', 'ANY') for conditions
Metadata:
content-type (str|None): Content-Type from last fetch
remote_server_reply (str|None): Server header from last response
ignore_status_codes (List[int]|None): HTTP status codes to ignore
use_page_title_in_list (bool|None): Display page title in watch list (None = use system default)
Instance Attributes (not serialized):
__datastore: Reference to parent DataStore (set externally after creation)
data_dir: Filesystem path for this watch's data directory
Notes:
- Many fields default to None to distinguish "not set" from "set to default"
- When field is None, system-level defaults are used
    - Processor-specific configs (e.g., processor_config_*) are NOT stored in watch.json;
      they are stored in separate {processor_name}.json files
- This class is used for both Watch and Tag objects (tags reuse the structure)
"""
def __init__(self, *arg, **kw):
# Store datastore reference (common to Watch and Tag)
# Use single underscore to avoid name mangling issues in subclasses
        self._datastore = kw.pop('__datastore', None)
        # Store datastore_path (common to Watch and Tag)
        # pop() removes the key even when the value is falsy, so neither key
        # leaks into the dict constructor below
        self._datastore_path = kw.pop('datastore_path', None)
self.update({
# Custom notification content
# Re #110, so then if this is set to None, we know to use the default value instead
# Requires setting to None on submit if it's the same as the default
# Should be all None by default, so we use the system default in this case.
'body': None,
'browser_steps': [],
'browser_steps_last_error_step': None,
'conditions' : [],
'conditions_match_logic': CONDITIONS_MATCH_LOGIC_DEFAULT,
'check_count': 0,
            'check_unique_lines': False, # On change detected, compare against all history to see if it's something new
            'consecutive_filter_failures': 0, # Incremented each time the CSS/XPath filter cannot be located; reset when a check succeeds
'content-type': None,
'date_created': None,
'extract_text': [], # Extract text by regex after filters
'fetch_backend': 'system', # plaintext, playwright etc
'fetch_time': 0.0,
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
'filter_text_added': True,
'filter_text_removed': True,
'filter_text_replaced': True,
'follow_price_changes': True,
'has_ldjson_price_data': None,
'history_snapshot_max_length': None,
'headers': {}, # Extra headers to send
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
'ignore_status_codes': None,
            'in_stock_only': True, # Only trigger a change when going from out-of-stock to in-stock
'include_filters': [],
'last_checked': 0,
'last_error': False,
'last_notification_error': None,
'last_viewed': 0, # history key value of the last viewed via the [diff] link
'method': 'GET',
'notification_alert_count': 0,
'notification_body': None,
'notification_format': USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH,
'notification_muted': False,
'notification_screenshot': False, # Include the latest screenshot if available and supported by the apprise URL
'notification_title': None,
'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise)
'page_title': None, # <title> from the page
'paused': False,
'previous_md5': False,
'previous_md5_before_filters': False, # Used for skipping changedetection entirely
'processor': 'text_json_diff', # could be restock_diff or others from .processors
'price_change_threshold_percent': None,
'proxy': None, # Preferred proxy connection
'remote_server_reply': None, # From 'server' reply header
'sort_text_alphabetically': False,
'strip_ignored_lines': None,
'subtractive_selectors': [],
'tag': '', # Old system of text name for a tag, to be removed
'tags': [], # list of UUIDs to App.Tags
            'text_should_not_be_present': [], # Text that should not be present
'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
'time_between_check_use_default': True,
"time_schedule_limit": {
"enabled": False,
"monday": {
"enabled": True,
"start_time": "00:00",
"duration": {
"hours": "24",
"minutes": "00"
}
},
"tuesday": {
"enabled": True,
"start_time": "00:00",
"duration": {
"hours": "24",
"minutes": "00"
}
},
"wednesday": {
"enabled": True,
"start_time": "00:00",
"duration": {
"hours": "24",
"minutes": "00"
}
},
"thursday": {
"enabled": True,
"start_time": "00:00",
"duration": {
"hours": "24",
"minutes": "00"
}
},
"friday": {
"enabled": True,
"start_time": "00:00",
"duration": {
"hours": "24",
"minutes": "00"
}
},
"saturday": {
"enabled": True,
"start_time": "00:00",
"duration": {
"hours": "24",
"minutes": "00"
}
},
"sunday": {
"enabled": True,
"start_time": "00:00",
"duration": {
"hours": "24",
"minutes": "00"
}
},
},
'title': None, # An arbitrary field that overrides 'page_title'
'track_ldjson_price_data': None,
'trim_text_whitespace': False,
'remove_duplicate_lines': False,
'trigger_text': [], # List of text or regex to wait for until a change is detected
'url': '',
'use_page_title_in_list': None, # None = use system settings
'uuid': str(uuid.uuid4()),
'webdriver_delay': None,
'webdriver_js_execute_code': None, # Run before change-detection
})
        super().__init__(*arg, **kw)
if self.get('default'):
del self['default']
@classmethod
def get_property_names(cls):
"""
Get all @property attribute names from this model class using introspection.
This discovers computed/derived properties that are not stored in the datastore.
These properties should be filtered out during PUT/POST requests.
Returns:
frozenset: Immutable set of @property attribute names from the model class
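        Example:
            # Illustrative: drop computed properties from an incoming API payload
            # ('payload' is a hypothetical dict of request JSON)
            clean = {k: v for k, v in payload.items()
                     if k not in Watch.get_property_names()}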
"""
        # Cache per concrete class. Check cls.__dict__ rather than hasattr() so a
        # subclass does not silently reuse a cache built for its parent class
        # (the previous closure-based cache captured whichever class called first).
        if '_cached_property_names' not in cls.__dict__:
            properties = set()
            # Use introspection to find all @property attributes
            for name in dir(cls):
                # Skip private/magic attributes
                if name.startswith('_'):
                    continue
                try:
                    attr = getattr(cls, name)
                    # Check if it's a property descriptor
                    if isinstance(attr, property):
                        properties.add(name)
                except (AttributeError, TypeError):
                    continue
            cls._cached_property_names = frozenset(properties)
        return cls.__dict__['_cached_property_names']
def __deepcopy__(self, memo):
"""
Custom deepcopy for all watch_base subclasses (Watch, Tag, etc.).
        CRITICAL FIX: Prevents copying large reference objects like __datastore,
        which would otherwise be duplicated on every deepcopy and cause runaway memory growth.
This is called by:
- api/Watch.py:76 (API endpoint)
- api/Tags.py:28 (Tags API)
- processors/base.py:26 (EVERY processor run)
- store/__init__.py:544 (clone watch)
- And other locations
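        Example (illustrative; assumes a live watch with a datastore attached):
            from copy import deepcopy
            clone = deepcopy(watch)
            # Settings are copied; the datastore reference is shared, not duplicated
            assert clone is not watch and clone._datastore is watch._datastore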
"""
from copy import deepcopy
# Create new instance without calling __init__
cls = self.__class__
new_obj = cls.__new__(cls)
memo[id(self)] = new_obj
# Copy the dict data (all the settings)
for key, value in self.items():
new_obj[key] = deepcopy(value, memo)
# Copy instance attributes dynamically
# This handles Watch-specific attrs (like __datastore) and any future subclass attrs
for attr_name in dir(self):
# Skip methods, special attrs, and dict keys
if attr_name.startswith('_') and not attr_name.startswith('__'):
# This catches _model__datastore, _model__history_n, etc.
try:
attr_value = getattr(self, attr_name)
# Special handling: Share references to large objects instead of copying
# Examples: _datastore, __datastore, __app_reference, __global_settings, etc.
if (attr_name == '_datastore' or
attr_name.endswith('__datastore') or
attr_name.endswith('__app')):
# Share the reference (don't copy!) to prevent memory leaks
setattr(new_obj, attr_name, attr_value)
# Skip cache attributes - let them regenerate on demand
elif 'cache' in attr_name.lower():
pass # Don't copy caches
# Copy regular instance attributes
elif not callable(attr_value):
setattr(new_obj, attr_name, attr_value)
except AttributeError:
pass # Attribute doesn't exist in this instance
return new_obj
def __getstate__(self):
"""
Custom pickle serialization for all watch_base subclasses.
Excludes large reference objects (like __datastore) from serialization.
"""
# Get the dict data
state = dict(self)
# Collect instance attributes (excluding methods and large references)
instance_attrs = {}
for attr_name in dir(self):
if attr_name.startswith('_') and not attr_name.startswith('__'):
try:
attr_value = getattr(self, attr_name)
# Exclude large reference objects and caches from serialization
if not (attr_name == '_datastore' or
attr_name.endswith('__datastore') or
attr_name.endswith('__app') or
'cache' in attr_name.lower() or
callable(attr_value)):
instance_attrs[attr_name] = attr_value
except AttributeError:
pass
if instance_attrs:
state['__instance_metadata__'] = instance_attrs
return state
def __setstate__(self, state):
"""
Custom pickle deserialization for all watch_base subclasses.
WARNING: Large reference objects (like __datastore) are NOT restored!
Caller must restore these references after unpickling if needed.
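        Example (illustrative round-trip):
            import pickle
            restored = pickle.loads(pickle.dumps(watch))
            restored._datastore = datastore  # caller re-attaches the shared reference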
"""
# Extract metadata
metadata = state.pop('__instance_metadata__', {})
# Restore dict data
self.update(state)
# Restore instance attributes
for attr_name, attr_value in metadata.items():
setattr(self, attr_name, attr_value)
@property
def data_dir(self):
"""
The base directory for this watch/tag data (property, computed from UUID).
Common property for both Watch and Tag objects.
Returns path like: /datastore/{uuid}/
"""
return os.path.join(self._datastore_path, self['uuid']) if self._datastore_path else None
def ensure_data_dir_exists(self):
"""
Create the data directory if it doesn't exist.
Common method for both Watch and Tag objects.
"""
from loguru import logger
if not os.path.isdir(self.data_dir):
logger.debug(f"> Creating data dir {self.data_dir}")
os.mkdir(self.data_dir)
def get_global_setting(self, *path):
"""
Get a setting from the global datastore configuration.
Args:
*path: Path to the setting (e.g., 'application', 'history_snapshot_max_length')
Returns:
The setting value, or None if not found
Example:
maxlen = self.get_global_setting('application', 'history_snapshot_max_length')
"""
if not self._datastore:
return None
try:
value = self._datastore['settings']
for key in path:
value = value[key]
return value
except (KeyError, TypeError):
return None
def _get_commit_data(self):
"""
Prepare data for commit (can be overridden by subclasses).
Returns:
dict: Data to serialize (filtered as needed by subclass)
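        Example (hypothetical subclass override that drops processor-specific keys):
            def _get_commit_data(self):
                data = super()._get_commit_data()
                return {k: v for k, v in data.items()
                        if not k.startswith('processor_config_')}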
"""
import copy
# Acquire datastore lock to prevent concurrent modifications during copy
lock = self._datastore.lock if self._datastore and hasattr(self._datastore, 'lock') else None
if lock:
with lock:
snapshot = dict(self)
else:
snapshot = dict(self)
# Deep copy snapshot (slower, but done outside lock to minimize contention)
# Subclasses can override to filter keys (e.g., Watch excludes processor_config_*)
return {k: copy.deepcopy(v) for k, v in snapshot.items()}
def _save_to_disk(self, data_dict, uuid):
"""
Save data to disk (must be implemented by subclasses).
Args:
data_dict: Dictionary to save
uuid: UUID for logging
Raises:
NotImplementedError: If subclass doesn't implement
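        Example (hypothetical subclass sketch; uses os.replace() for an atomic write):
            def _save_to_disk(self, data_dict, uuid):
                import json, tempfile
                fd, tmp = tempfile.mkstemp(dir=self.data_dir, suffix='.tmp')
                with os.fdopen(fd, 'w') as f:
                    json.dump(data_dict, f, indent=2)
                os.replace(tmp, os.path.join(self.data_dir, 'watch.json'))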
"""
raise NotImplementedError("Subclass must implement _save_to_disk()")
def commit(self):
"""
Save this watch/tag immediately to disk using atomic write.
Common commit logic for Watch and Tag objects.
Subclasses override _get_commit_data() and _save_to_disk() for specifics.
Fire-and-forget: Logs errors but does not raise exceptions.
Data remains in memory even if save fails, so next commit will retry.
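        Example:
            watch['paused'] = True
            watch.commit()  # persisted immediately; failures are logged, not raised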
"""
from loguru import logger
if not self.data_dir:
entity_type = self.__class__.__name__
logger.error(f"Cannot commit {entity_type} {self.get('uuid')} without datastore_path")
return
uuid = self.get('uuid')
if not uuid:
entity_type = self.__class__.__name__
logger.error(f"Cannot commit {entity_type} without UUID")
return
# Get data from subclass (may filter keys)
try:
data_dict = self._get_commit_data()
except Exception as e:
logger.error(f"Failed to prepare commit data for {uuid}: {e}")
return
# Save to disk via subclass implementation
try:
# Determine entity type from module name (Watch.py -> watch, Tag.py -> tag)
entity_type = _determine_entity_type(self.__class__)
filename = f"{entity_type}.json"
self._save_to_disk(data_dict, uuid)
logger.debug(f"Committed {entity_type} {uuid} to {uuid}/{filename}")
except Exception as e:
logger.error(f"Failed to commit {uuid}: {e}")