import os
import uuid

from changedetectionio import strtobool

from .persistence import EntityPersistenceMixin, _determine_entity_type

__all__ = ['EntityPersistenceMixin', 'watch_base']

USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH = 'System default'
CONDITIONS_MATCH_LOGIC_DEFAULT = 'ALL'


class watch_base(dict):
    """
    Base watch domain model (inherits from dict for backward compatibility).

    WARNING: This class inherits from dict, which violates proper encapsulation.
    Dict inheritance is legacy technical debt that should be refactored to a proper
    domain model (e.g., Pydantic BaseModel) for better type safety and validation.

    TODO: Migrate to Pydantic BaseModel for:
    - Type safety and IDE autocomplete
    - Automatic validation
    - Clear separation between domain model and serialization
    - Database backend abstraction (file → postgres → mongodb)
    - Configuration override chain resolution (Watch → Tag → Global)
    - Immutability options
    - Better testing
    - USE https://docs.pydantic.dev/latest/integrations/datamodel_code_generator TO BUILD THE MODEL FROM THE API-SPEC!!!

    CHAIN RESOLUTION ARCHITECTURE:
    The dream is a 3-level override hierarchy:
        Watch settings → Tag/Group settings → Global settings

    Current implementation: MANUAL resolution scattered across the codebase
    - Processors manually check watch.get('field')
    - Loop through tags to find overrides_watch=True
    - Fall back to datastore['settings']['application']['field']

    Pydantic implementation: AUTOMATIC resolution via @computed_field
    - Single source of truth for each setting's resolution logic
    - Type-safe, testable, self-documenting
    - Example: watch.resolved_fetch_backend (instead of nested dict navigation); see the sketch below

    See: Watch.py model docstring for detailed Pydantic architecture plan
    See: Tag.py model docstring for tag override explanation
    See: processors/restock_diff/processor.py:184-192 for current manual example
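
    A minimal sketch of what the @computed_field resolution could look like
    (hypothetical model: the Tag/Global stand-in fields and the 'html_requests'
    fallback are assumptions for illustration, not the current API):

        from pydantic import BaseModel, computed_field

        class WatchModel(BaseModel):
            fetch_backend: str = 'system'
            tag_fetch_backend: str = ''                   # stand-in for a Tag-level override
            global_fetch_backend: str = 'html_requests'   # stand-in for the global setting

            @computed_field
            @property
            def resolved_fetch_backend(self) -> str:
                # Watch -> Tag -> Global: the first explicit value wins
                if self.fetch_backend != 'system':
                    return self.fetch_backend
                if self.tag_fetch_backend:
                    return self.tag_fetch_backend
                return self.global_fetch_backend

        WatchModel().resolved_fetch_backend  # -> 'html_requests'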

    Core Fields:
        uuid (str): Unique identifier for this watch (auto-generated)
        url (str): Target URL to monitor for changes
        title (str|None): Custom display name (overrides page_title if set)
        page_title (str|None): Title extracted from <title> tag of monitored page
        tags (List[str]): List of tag UUIDs for categorization
        tag (str): DEPRECATED - Old single-tag system, use tags instead

    Check Configuration:
        processor (str): Processor type ('text_json_diff', 'restock_diff', etc.)
        fetch_backend (str): Fetcher to use ('system', 'html_requests', 'playwright', etc.)
        method (str): HTTP method ('GET', 'POST', etc.)
        headers (dict): Custom HTTP headers to send
        proxy (str|None): Preferred proxy server
        paused (bool): Whether change detection is paused

    Scheduling:
        time_between_check (dict): Check interval {'weeks': int, 'days': int, 'hours': int, 'minutes': int, 'seconds': int}
        time_between_check_use_default (bool): Use global default interval if True
        time_schedule_limit (dict): Weekly schedule limiting when checks can run
            Structure: {
                'enabled': bool,
                'monday/tuesday/.../sunday': {
                    'enabled': bool,
                    'start_time': str ('HH:MM'),
                    'duration': {'hours': str, 'minutes': str}
                }
            }

    Content Filtering:
        include_filters (List[str]): CSS/XPath selectors to extract content
        subtractive_selectors (List[str]): Selectors to remove from content
        ignore_text (List[str]): Text patterns to ignore in change detection
        trigger_text (List[str]): Text/regex that must be present to trigger a change
        text_should_not_be_present (List[str]): Text that should NOT be present
        extract_text (List[str]): Regex patterns to extract specific text after filtering

    Text Processing:
        trim_text_whitespace (bool): Strip leading/trailing whitespace
        sort_text_alphabetically (bool): Sort lines alphabetically before comparison
        remove_duplicate_lines (bool): Remove duplicate lines
        check_unique_lines (bool): Compare against all history for unique lines
        strip_ignored_lines (bool|None): Remove lines matching ignore patterns

    Change Detection Filters:
        filter_text_added (bool): Include added text in change detection
        filter_text_removed (bool): Include removed text in change detection
        filter_text_replaced (bool): Include replaced text in change detection

    Browser Automation:
        browser_steps (List[dict]): Browser automation steps for JS-heavy sites
        browser_steps_last_error_step (int|None): Last step that caused an error
        webdriver_delay (int|None): Seconds to wait after page load
        webdriver_js_execute_code (str|None): JavaScript to execute before extraction

    Restock Detection:
        in_stock_only (bool): Only trigger on in-stock transitions
        follow_price_changes (bool): Monitor price changes
        has_ldjson_price_data (bool|None): Whether page has LD-JSON price data
        track_ldjson_price_data (str|None): Track LD-JSON price data ('ACCEPT', 'REJECT', None)
        price_change_threshold_percent (float|None): Minimum price change % to trigger

    Notifications:
        notification_urls (List[str]): Apprise URLs for notifications
        notification_title (str|None): Custom notification title template
        notification_body (str|None): Custom notification body template
        notification_format (str): Notification format (e.g., 'System default', 'Text', 'HTML')
        notification_muted (bool): Disable notifications for this watch
        notification_screenshot (bool): Include screenshot in notifications
        notification_alert_count (int): Number of notifications sent
        last_notification_error (str|None): Last notification error message
        body (str|None): DEPRECATED? Legacy notification body field
        filter_failure_notification_send (bool): Send notification on filter failures

    History & State:
        date_created (int|None): Unix timestamp of watch creation
        last_checked (int): Unix timestamp of last check
        last_viewed (int): History snapshot key of last user view
        last_error (str|bool): Last error message, or False if no error
        check_count (int): Total number of checks performed
        fetch_time (float): Duration of last fetch in seconds
        consecutive_filter_failures (int): Counter for consecutive filter match failures
        previous_md5 (str|bool): MD5 hash of previous content
        previous_md5_before_filters (str|bool): MD5 hash before filters applied
        history_snapshot_max_length (int|None): Max history snapshots to keep (None = use global)

    Conditions:
        conditions (dict): Custom conditions for change detection logic
        conditions_match_logic (str): Logic operator ('ALL', 'ANY') for conditions

    Metadata:
        content-type (str|None): Content-Type from last fetch
        remote_server_reply (str|None): Server header from last response
        ignore_status_codes (List[int]|None): HTTP status codes to ignore
        use_page_title_in_list (bool|None): Display page title in watch list (None = use system default)

    Instance Attributes (not serialized):
        __datastore: Reference to parent DataStore (set externally after creation)
        data_dir: Filesystem path for this watch's data directory

    Notes:
        - Many fields default to None to distinguish "not set" from "set to default"
        - When a field is None, system-level defaults are used
        - Processor-specific configs (e.g., processor_config_*) are NOT stored in watch.json;
          they are stored in separate {processor_name}.json files
        - This class is used for both Watch and Tag objects (tags reuse the structure)
    """

    def __init__(self, *arg, **kw):
        # Store datastore reference (common to Watch and Tag)
        # Use single underscore to avoid name mangling issues in subclasses
        self._datastore = kw.get('__datastore')
        if kw.get('__datastore'):
            del kw['__datastore']

        # Store datastore_path (common to Watch and Tag)
        self._datastore_path = kw.get('datastore_path')
        if kw.get('datastore_path'):
            del kw['datastore_path']

        self.update({
            # Custom notification content
            # Re #110: if this is set to None, we know to use the default value instead
            # Requires setting to None on submit if it's the same as the default
            # Should be all None by default, so we use the system default in this case.
            'body': None,
            'browser_steps': [],
            'browser_steps_last_error_step': None,
            'conditions': [],
            'conditions_match_logic': CONDITIONS_MATCH_LOGIC_DEFAULT,
            'check_count': 0,
            'check_unique_lines': False,  # On change-detected, compare against all history if it's something new
            'consecutive_filter_failures': 0,  # Every time the CSS/XPath filter cannot be located, reset when all is fine.
            'content-type': None,
            'date_created': None,
            'extract_text': [],  # Extract text by regex after filters
            'fetch_backend': 'system',  # 'system', 'html_requests', 'playwright' etc
            'fetch_time': 0.0,
            'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
            'filter_text_added': True,
            'filter_text_removed': True,
            'filter_text_replaced': True,
            'follow_price_changes': True,
            'has_ldjson_price_data': None,
            'history_snapshot_max_length': None,
            'headers': {},  # Extra headers to send
            'ignore_text': [],  # List of text to ignore when calculating the comparison checksum
            'ignore_status_codes': None,
            'in_stock_only': True,  # Only trigger a change when going from out-of-stock to in-stock
            'include_filters': [],
            'last_checked': 0,
            'last_error': False,
            'last_notification_error': None,
            'last_viewed': 0,  # history key value of the last viewed via the [diff] link
            'method': 'GET',
            'notification_alert_count': 0,
            'notification_body': None,
            'notification_format': USE_SYSTEM_DEFAULT_NOTIFICATION_FORMAT_FOR_WATCH,
            'notification_muted': False,
            'notification_screenshot': False,  # Include the latest screenshot if available and supported by the Apprise URL
            'notification_title': None,
            'notification_urls': [],  # List of URLs to add to the notification queue (usually Apprise)
            'page_title': None,  # <title> from the page
            'paused': False,
            'previous_md5': False,
            'previous_md5_before_filters': False,  # Used for skipping changedetection entirely
            'processor': 'text_json_diff',  # could be restock_diff or others from .processors
            'price_change_threshold_percent': None,
            'proxy': None,  # Preferred proxy connection
            'remote_server_reply': None,  # From 'server' reply header
            'sort_text_alphabetically': False,
            'strip_ignored_lines': None,
            'subtractive_selectors': [],
            'tag': '',  # Old system of text name for a tag, to be removed
            'tags': [],  # list of UUIDs to App.Tags
            'text_should_not_be_present': [],  # Text that should not be present
            'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
            'time_between_check_use_default': True,
            "time_schedule_limit": {
                "enabled": False,
                "monday": {
                    "enabled": True,
                    "start_time": "00:00",
                    "duration": {
                        "hours": "24",
                        "minutes": "00"
                    }
                },
                "tuesday": {
                    "enabled": True,
                    "start_time": "00:00",
                    "duration": {
                        "hours": "24",
                        "minutes": "00"
                    }
                },
                "wednesday": {
                    "enabled": True,
                    "start_time": "00:00",
                    "duration": {
                        "hours": "24",
                        "minutes": "00"
                    }
                },
                "thursday": {
                    "enabled": True,
                    "start_time": "00:00",
                    "duration": {
                        "hours": "24",
                        "minutes": "00"
                    }
                },
                "friday": {
                    "enabled": True,
                    "start_time": "00:00",
                    "duration": {
                        "hours": "24",
                        "minutes": "00"
                    }
                },
                "saturday": {
                    "enabled": True,
                    "start_time": "00:00",
                    "duration": {
                        "hours": "24",
                        "minutes": "00"
                    }
                },
                "sunday": {
                    "enabled": True,
                    "start_time": "00:00",
                    "duration": {
                        "hours": "24",
                        "minutes": "00"
                    }
                },
            },
            'title': None,  # An arbitrary field that overrides 'page_title'
            'track_ldjson_price_data': None,
            'trim_text_whitespace': False,
            'remove_duplicate_lines': False,
            'trigger_text': [],  # List of text or regex to wait for until a change is detected
            'url': '',
            'use_page_title_in_list': None,  # None = use system settings
            'uuid': str(uuid.uuid4()),
            'webdriver_delay': None,
            'webdriver_js_execute_code': None,  # Run before change-detection
        })

        super(watch_base, self).__init__(*arg, **kw)

        if self.get('default'):
            del self['default']
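
    # A minimal construction sketch (hypothetical values; note that 'datastore_path'
    # and '__datastore' are consumed by __init__ above and never stored as dict keys):
    #
    #   w = watch_base(url='https://example.com', datastore_path='/datastore')
    #   'datastore_path' in w   # -> False (held as self._datastore_path instead)
    #   w.data_dir              # -> '/datastore/<uuid>'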

    @classmethod
    def get_property_names(cls):
        """
        Get all @property attribute names from this model class using introspection.

        This discovers computed/derived properties that are not stored in the datastore.
        These properties should be filtered out during PUT/POST requests.

        Returns:
            frozenset: Immutable set of @property attribute names from the model class
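
        Example (illustrative; a hypothetical subclass defining one @property):

            class Watch(watch_base):
                @property
                def history_n(self):
                    return 0

            'history_n' in Watch.get_property_names()  # -> True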
        """
        import functools

        # Create a cached version if it doesn't exist
        if not hasattr(cls, '_cached_get_property_names'):
            @functools.cache
            def _get_props():
                properties = set()
                # Use introspection to find all @property attributes
                for name in dir(cls):
                    # Skip private/magic attributes
                    if name.startswith('_'):
                        continue
                    try:
                        attr = getattr(cls, name)
                        # Check if it's a property descriptor
                        if isinstance(attr, property):
                            properties.add(name)
                    except (AttributeError, TypeError):
                        continue
                return frozenset(properties)

            cls._cached_get_property_names = _get_props

        return cls._cached_get_property_names()

    def __deepcopy__(self, memo):
        """
        Custom deepcopy for all watch_base subclasses (Watch, Tag, etc.).

        CRITICAL FIX: Prevents copying large reference objects like __datastore
        which would cause exponential memory growth when Watch objects are deepcopied.

        This is called by:
        - api/Watch.py:76 (API endpoint)
        - api/Tags.py:28 (Tags API)
        - processors/base.py:26 (EVERY processor run)
        - store/__init__.py:544 (clone watch)
        - And other locations
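
        A quick behavioral sketch (assumes a '_datastore' reference was attached
        externally after creation):

            from copy import deepcopy
            copied = deepcopy(watch)
            copied['browser_steps'] is not watch['browser_steps']  # dict data is deep-copied
            copied._datastore is watch._datastore                  # large references are shared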
        """
        from copy import deepcopy

        # Create new instance without calling __init__
        cls = self.__class__
        new_obj = cls.__new__(cls)
        memo[id(self)] = new_obj

        # Copy the dict data (all the settings)
        for key, value in self.items():
            new_obj[key] = deepcopy(value, memo)

        # Copy instance attributes dynamically
        # This handles Watch-specific attrs (like __datastore) and any future subclass attrs
        for attr_name in dir(self):
            # Skip methods, special attrs, and dict keys
            if attr_name.startswith('_') and not attr_name.startswith('__'):
                # This catches _model__datastore, _model__history_n, etc.
                try:
                    attr_value = getattr(self, attr_name)

                    # Special handling: Share references to large objects instead of copying
                    # Examples: _datastore, __datastore, __app_reference, __global_settings, etc.
                    if (attr_name == '_datastore' or
                            attr_name.endswith('__datastore') or
                            attr_name.endswith('__app')):
                        # Share the reference (don't copy!) to prevent memory leaks
                        setattr(new_obj, attr_name, attr_value)
                    # Skip cache attributes - let them regenerate on demand
                    elif 'cache' in attr_name.lower():
                        pass  # Don't copy caches
                    # Copy regular instance attributes
                    elif not callable(attr_value):
                        setattr(new_obj, attr_name, attr_value)
                except AttributeError:
                    pass  # Attribute doesn't exist in this instance

        return new_obj

    def __getstate__(self):
        """
        Custom pickle serialization for all watch_base subclasses.

        Excludes large reference objects (like __datastore) from serialization.
        """
        # Get the dict data
        state = dict(self)

        # Collect instance attributes (excluding methods and large references)
        instance_attrs = {}
        for attr_name in dir(self):
            if attr_name.startswith('_') and not attr_name.startswith('__'):
                try:
                    attr_value = getattr(self, attr_name)
                    # Exclude large reference objects and caches from serialization
                    if not (attr_name == '_datastore' or
                            attr_name.endswith('__datastore') or
                            attr_name.endswith('__app') or
                            'cache' in attr_name.lower() or
                            callable(attr_value)):
                        instance_attrs[attr_name] = attr_value
                except AttributeError:
                    pass

        if instance_attrs:
            state['__instance_metadata__'] = instance_attrs

        return state

    def __setstate__(self, state):
        """
        Custom pickle deserialization for all watch_base subclasses.

        WARNING: Large reference objects (like __datastore) are NOT restored!
        Caller must restore these references after unpickling if needed.
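
        Round-trip sketch (illustrative; 'datastore' is whatever reference the
        caller re-attaches after unpickling):

            import pickle
            restored = pickle.loads(pickle.dumps(watch))
            restored._datastore = datastore  # not serialized, must be restored manually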
        """
        # Extract metadata
        metadata = state.pop('__instance_metadata__', {})

        # Restore dict data
        self.update(state)

        # Restore instance attributes
        for attr_name, attr_value in metadata.items():
            setattr(self, attr_name, attr_value)

    @property
    def data_dir(self):
        """
        The base directory for this watch/tag data (property, computed from UUID).

        Common property for both Watch and Tag objects.
        Returns a path like: /datastore/{uuid}/
        """
        return os.path.join(self._datastore_path, self['uuid']) if self._datastore_path else None

    def ensure_data_dir_exists(self):
        """
        Create the data directory if it doesn't exist.

        Common method for both Watch and Tag objects.
        """
        from loguru import logger
        if not os.path.isdir(self.data_dir):
            logger.debug(f"> Creating data dir {self.data_dir}")
            os.mkdir(self.data_dir)

    def get_global_setting(self, *path):
        """
        Get a setting from the global datastore configuration.

        Args:
            *path: Path to the setting (e.g., 'application', 'history_snapshot_max_length')

        Returns:
            The setting value, or None if not found

        Example:
            maxlen = self.get_global_setting('application', 'history_snapshot_max_length')
        """
        if not self._datastore:
            return None

        try:
            value = self._datastore['settings']
            for key in path:
                value = value[key]
            return value
        except (KeyError, TypeError):
            return None

    def _get_commit_data(self):
        """
        Prepare data for commit (can be overridden by subclasses).

        Returns:
            dict: Data to serialize (filtered as needed by subclass)
        """
        import copy

        # Acquire datastore lock to prevent concurrent modifications during copy
        lock = self._datastore.lock if self._datastore and hasattr(self._datastore, 'lock') else None

        if lock:
            with lock:
                snapshot = dict(self)
        else:
            snapshot = dict(self)

        # Deep copy the snapshot (slower, but done outside the lock to minimize contention)
        # Subclasses can override to filter keys (e.g., Watch excludes processor_config_*)
        return {k: copy.deepcopy(v) for k, v in snapshot.items()}
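
    # Hypothetical override sketch for the key-filtering described above (the
    # 'processor_config_' prefix matches the Notes section of the class docstring):
    #
    #   class Watch(watch_base):
    #       def _get_commit_data(self):
    #           data = super()._get_commit_data()
    #           return {k: v for k, v in data.items() if not k.startswith('processor_config_')}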

    def _save_to_disk(self, data_dict, uuid):
        """
        Save data to disk (must be implemented by subclasses).

        Args:
            data_dict: Dictionary to save
            uuid: UUID for logging

        Raises:
            NotImplementedError: If the subclass doesn't implement it
        """
        raise NotImplementedError("Subclass must implement _save_to_disk()")

    def commit(self):
        """
        Save this watch/tag immediately to disk using an atomic write.

        Common commit logic for Watch and Tag objects.
        Subclasses override _get_commit_data() and _save_to_disk() for specifics.

        Fire-and-forget: Logs errors but does not raise exceptions.
        Data remains in memory even if the save fails, so the next commit will retry.
        """
        from loguru import logger

        if not self.data_dir:
            entity_type = self.__class__.__name__
            logger.error(f"Cannot commit {entity_type} {self.get('uuid')} without datastore_path")
            return

        uuid = self.get('uuid')
        if not uuid:
            entity_type = self.__class__.__name__
            logger.error(f"Cannot commit {entity_type} without UUID")
            return

        # Get data from subclass (may filter keys)
        try:
            data_dict = self._get_commit_data()
        except Exception as e:
            logger.error(f"Failed to prepare commit data for {uuid}: {e}")
            return

        # Save to disk via subclass implementation
        try:
            # Determine entity type from module name (Watch.py -> watch, Tag.py -> tag)
            entity_type = _determine_entity_type(self.__class__)
            filename = f"{entity_type}.json"
            self._save_to_disk(data_dict, uuid)
            logger.debug(f"Committed {entity_type} {uuid} to {uuid}/{filename}")
        except Exception as e:
            logger.error(f"Failed to commit {uuid}: {e}")