android/.github/scripts/release-notes/process_release_notes.py
2025-11-27 22:55:06 +00:00

270 lines
9.3 KiB
Python

import re
import sys
import subprocess
import json
import argparse
from typing import List, Tuple, Dict
def extract_jira_tickets(line: str) -> List[str]:
    """Collect every Jira-style key (uppercase letters, hyphen, digits) in the line.

    The pattern is not anchored, so a key is picked up no matter what text
    surrounds it. Keys are returned in order of appearance; duplicates are kept.
    """
    return [found.group(0) for found in re.finditer(r'[A-Z]+-\d+', line)]
def extract_pr_numbers(line: str) -> List[str]:
    """Return the digits of every ``#123``-style reference in the line.

    Only the number (without the leading ``#``) is returned, as a string,
    in order of appearance.
    """
    return [ref.group(1) for ref in re.finditer(r'#(\d+)', line)]
def extract_pr_url(line: str) -> str:
    """Locate a GitHub pull-request URL such as https://github.com/foo/bar/pull/123.

    Returns:
        The first PR URL that appears in the line, or an empty string when
        the line contains none.
    """
    first_hit = re.search(r'https://github\.com/[\w-]+/[\w.-]+/pull/\d+', line)
    if first_hit is None:
        return ""
    return first_hit.group(0)
def extract_pr_number_from_url(pr_url: str) -> str:
    """Pull the numeric PR id out of a GitHub pull-request URL.

    Args:
        pr_url: GitHub PR URL (e.g., https://github.com/foo/bar/pull/123)

    Returns:
        The digits following ``/pull/`` as a string, or an empty string when
        the URL has no such segment.
    """
    found = re.search(r'/pull/(\d+)', pr_url)
    if not found:
        return ""
    return found.group(1)
def precache_pr_labels(limit: int = 500) -> Dict[str, List[str]]:
    """Build a PR-number -> label-names lookup from one ``gh pr list`` call.

    Runs ``gh pr list --state merged --json number,labels --limit <limit>``
    and indexes the JSON response by PR number.

    Args:
        limit: Maximum number of merged PRs to request (default: 500)

    Returns:
        Dictionary mapping PR number (as a string) to that PR's label names

    Raises:
        subprocess.CalledProcessError: if the ``gh`` command exits non-zero.
    """
    print(f"Pre-caching labels for last {limit} PRs...")

    gh_command = [
        'gh', 'pr', 'list',
        '--state', 'merged',
        '--json', 'number,labels',
        '--limit', str(limit),
    ]
    completed = subprocess.run(gh_command, capture_output=True, text=True, check=True)

    # PR numbers are stored as strings so they compare equal to numbers
    # extracted from URLs with a regex.
    labels_by_pr = {
        str(entry['number']): [label['name'] for label in entry.get('labels', [])]
        for entry in json.loads(completed.stdout)
    }

    print(f"Cached {len(labels_by_pr)} PRs")
    return labels_by_pr
def fetch_labels(github_pr_url: str) -> List[str]:
    """Ask the GitHub CLI for the label names attached to a single PR.

    Runs ``gh pr view <url> --json labels --jq '.labels[].name'`` and returns
    the non-blank output lines, each stripped of surrounding whitespace.

    Raises:
        subprocess.CalledProcessError: if the ``gh`` command exits non-zero.
    """
    gh_command = ['gh', 'pr', 'view', github_pr_url, '--json', 'labels', '--jq', '.labels[].name']
    completed = subprocess.run(gh_command, capture_output=True, text=True, check=True)

    label_names = []
    for raw_name in completed.stdout.strip().split('\n'):
        name = raw_name.strip()
        if name:
            label_names.append(name)
    return label_names
def should_skip_pr(release_app_label: str, pr_labels: List[str]) -> bool:
    """Decide whether a PR belongs to a different app and must be left out.

    A PR is skipped only when it carries at least one ``app:`` label and none
    of those labels equals ``release_app_label``. PRs with no ``app:`` label
    at all are never skipped.

    Args:
        release_app_label: The app label to look for (e.g., "app:password-manager")
        pr_labels: List of labels from the PR

    Returns:
        True if the PR should be skipped, False otherwise
    """
    app_labels = [name for name in pr_labels if name.startswith('app:')]
    if not app_labels:
        # No app labels at all: the PR is not tied to a specific app.
        return False
    return release_app_label not in app_labels
# Jira ticket decorations stripped from a release-note line, applied in order.
_JIRA_DECORATION_PATTERNS = (
    re.compile(r'\[[A-Z]+-\d+\]'),      # [ABC-123] -> ""
    re.compile(r'[A-Z]+-\d+:\s'),       # ABC-123:  -> ""
    re.compile(r'[A-Z]+-\d+\s-\s'),     # ABC-123 - -> ""
)

# Keywords and conventional-commit prefixes stripped from a line, applied in order.
# The commit-type prefixes are anchored with \b so they only match whole words;
# without it "debug:" would lose its "bug:" tail and "foci:" its "ci:" tail.
_KEYWORD_PATTERNS = (
    re.compile(r'🍒'),                          # 🍒 -> ""
    re.compile(r'BACKPORT'),                    # BACKPORT -> ""
    re.compile(r'\[deps\]:'),                   # [deps]: -> ""
    re.compile(r'\bfeat(?:\([^)]*\))?:'),       # feat: or feat(ui): -> ""
    re.compile(r'\bbug(?:\([^)]*\))?:'),        # bug: or bug(core): -> ""
    re.compile(r'\bci(?:\([^)]*\))?:'),         # ci: or ci(workflow): -> ""
)


def process_line(line: str) -> str:
    """Clean one release-note line by removing Jira tickets, conventional commit prefixes and other common markers.

    Removal happens in this order: Jira ticket decorations, then keyword
    markers (cherry-pick emoji, BACKPORT, [deps]:) and the feat/bug/ci commit
    prefixes. Finally every run of whitespace collapses to a single space and
    the result is stripped. When the cleaned text differs from the stripped
    input, a "Processed: ..." line is printed for traceability.

    Args:
        line: A single line from release notes

    Returns:
        Processed line with tickets and prefixes removed

    Example:
        process_line("[ABC-123] feat(ui): Add new button") returns "Add new button"
    """
    original_stripped = line.strip()

    for pattern in _JIRA_DECORATION_PATTERNS + _KEYWORD_PATTERNS:
        line = pattern.sub('', line)

    # Removals leave gaps behind; collapse them so the output reads cleanly.
    cleaned = re.sub(r'\s+', ' ', line).strip()

    if cleaned != original_stripped:
        print(f"Processed: {original_stripped} -> {cleaned}")
    return cleaned
def process_file(input_file: str, release_app_label: str) -> Tuple[List[str], List[str], List[str], List[str]]:
    """Read a release-notes file and split it into tickets, PR numbers, cleaned notes and a debug trace.

    Only lines that start with ``* `` (after stripping) are treated as
    release-note entries. For each entry the PR labels are looked up (cache
    first, then an individual request) and the entry is dropped when
    ``should_skip_pr`` says it belongs to another app. Kept entries contribute
    their Jira tickets and ``#123`` references and are cleaned with
    ``process_line``. All other lines are passed through unchanged.

    Args:
        input_file: Path of the release-notes file to read (decoded as UTF-8)
        release_app_label: App label used to filter PRs (e.g., "app:password-manager")

    Returns:
        A 4-tuple ``(jira_tickets, pr_numbers, processed_lines, debug_lines)``.
        Tickets and PR numbers are de-duplicated with first-seen order kept;
        ``debug_lines`` records for every input line how it was handled.
    """
    jira_tickets: List[str] = []
    pr_numbers: List[str] = []
    processed_lines: List[str] = []
    debug_lines: List[str] = []

    print("Processing file: ", input_file)

    # GitHub API / CLI does not support fetching labels for multiple PRs in a single request;
    # individual requests are slow, so the most recent merged PRs are cached up front, which
    # should cover most cases. PRs missing from the cache fall back to individual requests.
    pr_label_cache = precache_pr_labels(500)

    # Explicit UTF-8: release notes may contain emoji (e.g. the cherry-pick marker),
    # which must not depend on the platform's default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()

            if not line.startswith('* '):
                # Headings, blank lines and other text are passed through untouched.
                processed_lines.append(line)
                debug_lines.append(f"{line} | skipped - processing" if line else "")
                continue

            pr_url = extract_pr_url(line)
            pr_labels: List[str] = []

            # Fetch labels from PR URL if available
            if pr_url:
                pr_number = extract_pr_number_from_url(pr_url)
                # NOTE(review): the PR number is recorded before the skip check below, so
                # PRs filtered out for another app still end up in pr_numbers - confirm intended.
                pr_numbers.append(pr_number)

                # Check cache first, fallback to individual fetch
                if pr_number in pr_label_cache:
                    pr_labels = pr_label_cache[pr_number]
                    print(f"Using cached labels for PR #{pr_number}")
                else:
                    print(f"PR #{pr_number} not in cache, fetching individually...")
                    pr_labels = fetch_labels(pr_url)

            if should_skip_pr(release_app_label, pr_labels):
                # The PR is labeled for a different app: keep it only in the debug trace.
                debug_lines.append(f"{line} | skipped - labels: {pr_labels}")
                continue

            jira_tickets.extend(extract_jira_tickets(line))
            pr_numbers.extend(extract_pr_numbers(line))
            processed_lines.append(process_line(line))
            debug_lines.append(f"{line} | labels: {pr_labels}")

    # Remove duplicates while preserving order
    jira_tickets = list(dict.fromkeys(jira_tickets))
    pr_numbers = list(dict.fromkeys(pr_numbers))

    print("Jira tickets:", ",".join(jira_tickets))
    print("PR numbers:", ",".join(pr_numbers))
    print("Finished processing file: ", input_file)
    return jira_tickets, pr_numbers, processed_lines, debug_lines
def save_results(jira_tickets: List[str], pr_numbers: List[str], processed_lines: List[str], debug_lines: List[str],
                 jira_file: str = 'jira_tickets.txt',
                 pr_file: str = 'pr_numbers.txt',
                 processed_file: str = 'processed_notes.txt',
                 debug_file: str = 'processed_notes_debug.txt'
                 ) -> None:
    """Write each result list to its own file, one entry per line.

    Entries are joined with ``\\n`` and no trailing newline is added. Files are
    written as UTF-8 so non-ASCII text in the notes (emoji, accented names)
    is saved the same way on every platform. Existing files are overwritten.

    Args:
        jira_tickets: Jira ticket keys to write to ``jira_file``
        pr_numbers: PR numbers to write to ``pr_file``
        processed_lines: Cleaned release-note lines to write to ``processed_file``
        debug_lines: Per-line debug trace to write to ``debug_file``
        jira_file: Output path for Jira tickets
        pr_file: Output path for PR numbers
        processed_file: Output path for processed notes
        debug_file: Output path for debug notes
    """
    outputs = (
        (jira_file, jira_tickets),
        (pr_file, pr_numbers),
        (processed_file, processed_lines),
        (debug_file, debug_lines),
    )
    for path, lines in outputs:
        with open(path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))
def parse_args():
    """Parse command line arguments.

    Positional arguments are the app label (required) and the input file
    (optional, falls back to ``release_notes.txt``). The four ``--*-filepath``
    options choose where each output is written.

    Returns:
        Parsed arguments namespace
    """
    parser = argparse.ArgumentParser(
        description='Process release notes by extracting Jira tickets and PR numbers, and cleaning up the text.'
    )

    parser.add_argument(
        'release_app_label',
        help='Filter PRs by app label (e.g., app:password-manager)'
    )
    parser.add_argument(
        'input_file',
        # nargs='?' makes the positional optional; without it argparse still
        # requires the argument and the documented default is never used.
        nargs='?',
        default='release_notes.txt',
        help='Input file containing release notes (default: release_notes.txt)'
    )
    parser.add_argument(
        '--processed-filepath',
        default='processed_notes.txt',
        help='Output file for processed notes (default: processed_notes.txt)'
    )
    parser.add_argument(
        '--jira-filepath',
        default='jira_tickets.txt',
        help='Output file for Jira tickets (default: jira_tickets.txt)'
    )
    parser.add_argument(
        '--pr-filepath',
        default='pr_numbers.txt',
        help='Output file for PR numbers (default: pr_numbers.txt)'
    )
    parser.add_argument(
        '--debug-filepath',
        default='processed_notes_debug.txt',
        help='Output file for debug notes (default: processed_notes_debug.txt)'
    )

    return parser.parse_args()
if __name__ == '__main__':
    # Script entry point: parse the CLI, process the notes file, then write
    # the four result lists to their configured output paths.
    args = parse_args()
    results = process_file(args.input_file, args.release_app_label)
    tickets, prs, notes, debug_notes = results
    save_results(
        tickets,
        prs,
        notes,
        debug_notes,
        jira_file=args.jira_filepath,
        pr_file=args.pr_filepath,
        processed_file=args.processed_filepath,
        debug_file=args.debug_filepath,
    )