timescaledb/scripts/backport.py
Fabrízio de Royes Mello 04f0b47ca7 Force auto backport workflow
Currently, if a PR that is eligible for backporting touches a workflow
file, the automatic backport fails.

Now, adding the label `force-auto-backport-workflow` to a PR that touches
workflow files lets the automatic backport proceed. This is useful
because sometimes we need to fix a workflow and backport it to the
current release branch, or because adding support for a new Postgres
major version requires workflow changes that should be backported to
the release branch in case patch releases are created.
2024-10-24 15:30:59 -03:00

529 lines
19 KiB
Python
Executable File

#!/usr/bin/env python3
import os
import re
import string
import subprocess
import sys
from github import Github # This is PyGithub.
import requests
# Upper bound used for both the `git fetch --depth` and the `git log` commit
# count in this script, so that a bug cannot make it walk an unbounded history.
HISTORY_DEPTH = 1000
def run_query(query):
    """Send a GraphQL query to the GitHub API and return the decoded response.

    The request is authorized with the GITHUB_TOKEN environment variable.

    Raises ValueError when the HTTP status is not 200, when the body is not
    valid JSON, or when the decoded response contains an "errors" entry.
    """
    request = requests.post(
        "https://api.github.com/graphql",
        json={"query": query},
        headers={"Authorization": f'Bearer {os.environ.get("GITHUB_TOKEN")}'},
        timeout=20,
    )

    # A failed request (e.g. a gateway error) might not carry a JSON body at
    # all. Decode defensively, so that the error below still reports the status
    # code and the query instead of a bare JSON decoding failure.
    try:
        response = request.json()
    except ValueError:
        response = None

    # Have to work around the unique GraphQL convention of returning 200 for errors.
    if request.status_code != 200 or response is None or "errors" in response:
        raise ValueError(
            f"Query failed to run by returning code of {request.status_code}."
            f"\nQuery: '{query}'"
            f"\nResponse: '{request.text if response is None else response}'"
        )

    return response
def get_referenced_issue(pr_number):
    """Look up the issue that the given pull request closes.

    Only the first closing reference is requested from GitHub, together with
    at most 30 of its labels.

    Returns a tuple (issue number, issue title, set of label names), or
    (None, None, None) when there is no usable closing issue reference.
    """
    # Both limits in the query are deliberate: we only need the first issue,
    # and GitHub requires some small restriction on the labels, which is
    # counted towards the GraphQL API usage quota.
    query = string.Template(
        """
query {
repository(owner: "timescale", name: "timescaledb") {
pullRequest(number: $pr_number) {
closingIssuesReferences(first: 1) {
nodes {
number, title,
labels (first: 30) { nodes { name } }
}
}
}
}
}
"""
    ).substitute(pr_number=pr_number)
    ref_result = run_query(query)

    # The response looks like this:
    # {'data': {'repository': {'pullRequest': {'closingIssuesReferences': {'nodes': [{'number': 6819,
    #     'title': '[Bug]: Segfault when `ts_insert_blocker` function is called',
    #     'labels': {'nodes': [{'name': 'bug'}]}}]}}}}}
    pull_request = ref_result["data"]["repository"]["pullRequest"]
    issue_nodes = pull_request["closingIssuesReferences"]["nodes"]

    # The list can also be {'nodes': [None]} when the PR references an
    # inaccessible repository; such a reference is ignored.
    if issue_nodes and len(issue_nodes) == 1 and issue_nodes[0]:
        issue = issue_nodes[0]
        number = issue["number"]
        title = issue["title"]
        label_names = {label["name"] for label in issue["labels"]["nodes"]}
        return number, title, label_names

    return None, None, None
def set_auto_merge(pr_number):
    """Turn on auto-merge with the REBASE method for a PR of the target repository."""
    repo_owner, repo_name = target_repo_name.split("/")

    # The mutation identifies the PR by its id (some base64 string), which is
    # different from the PR number, so the id has to be looked up first.
    id_lookup = string.Template(
        """query {
repository(owner: "$owner", name: "$name") {
pullRequest(number: $pr_number) {
id
}
}
}"""
    ).substitute({"pr_number": pr_number, "owner": repo_owner, "name": repo_name})
    pr_id = run_query(id_lookup)["data"]["repository"]["pullRequest"]["id"]

    enable_auto_merge = string.Template(
        """mutation {
enablePullRequestAutoMerge(
input: {
pullRequestId: "$pr_id",
mergeMethod: REBASE
}
) {
clientMutationId
}
}"""
    ).substitute({"pr_id": pr_id})
    run_query(enable_auto_merge)
def git_output(command):
    """Run `git <command>` through the shell and return its stdout as text.

    Raises subprocess.CalledProcessError if git exits with a non-zero code.
    """
    completed = subprocess.run(
        f"git {command}",
        shell=True,
        check=True,
        stdout=subprocess.PIPE,
        text=True,
    )
    return completed.stdout
def git_check(command):
    """Run `git <command>` through the shell.

    Raises subprocess.CalledProcessError if git exits with a non-zero code.
    """
    subprocess.check_call(f"git {command}", shell=True)
def git_returncode(command):
    """Run `git <command>` through the shell and return its exit code.

    A non-zero exit code is returned to the caller, not raised.
    """
    return subprocess.call(f"git {command}", shell=True)
# Creating a PR returns 404 unless the token has the "access public
# repositories" permission.
github = Github(os.environ.get("GITHUB_TOKEN"))

source_remote = "origin"
source_repo_name = os.environ.get("GITHUB_REPOSITORY")
if source_repo_name:
    # Running inside a Github Action: the backport PRs go to the main repo.
    target_remote, target_repo_name = source_remote, source_repo_name
else:
    # Running manually for debugging: read the fixes from the main repo, but
    # create the backports in whatever repository (probably a fork) the
    # environment points to.
    source_repo_name = "timescale/timescaledb"
    target_repo_name = os.environ.get("BACKPORT_TARGET_REPO")
    target_remote = os.environ.get("BACKPORT_TARGET_REMOTE")
    if not (target_repo_name and target_remote):
        print(
            "Please specify the target repositories for debugging, using the "
            "environment variables BACKPORT_TARGET_REPO (e.g. `timescale/timescaledb`) "
            "and BACKPORT_TARGET_REMOTE (e.g. `origin`).",
            file=sys.stderr,
        )
        sys.exit(1)

print(
    f"Will look at '{source_repo_name}' (git remote '{source_remote}') for bug fixes, "
    f"and create the backport PRs in '{target_repo_name}' (git remote '{target_remote}')."
)

source_repo = github.get_repo(source_repo_name)
target_repo = github.get_repo(target_repo_name)
# Set git name and email corresponding to the token user.
token_user = github.get_user()

# The profile name is optional on GitHub, and environment values must be
# strings, so fall back to the login when the name is not set.
os.environ["GIT_COMMITTER_NAME"] = token_user.name or token_user.login

# This is an email that is used by Github when you opt to hide your real email
# address. It is required so that the commits are recognized by Github as made
# by the user. That is, if you use a wrong e-mail, there won't be a clickable
# profile picture next to the commit in the Github interface.
os.environ["GIT_COMMITTER_EMAIL"] = (
    f"{token_user.id}+{token_user.login}@users.noreply.github.com"
)

print(
    f"Will commit as {os.environ['GIT_COMMITTER_NAME']} <{os.environ['GIT_COMMITTER_EMAIL']}>"
)
# Fetch the main branch of the source remote into its remote-tracking ref.
# Apparently the local repo can be shallow in some cases in Github Actions, so
# the depth is specified explicitly. --unshallow is not used here because it
# complains on normal (non-shallow) repositories.
git_check(
    f"fetch --quiet --depth={HISTORY_DEPTH} {source_remote} main:refs/remotes/{source_remote}/main"
)
# Find out what is the branch corresponding to the previous version compared to
# main. We will backport to that branch.
# version.config is read from main as a list of `key = value` lines. Lines that
# do not have this form are skipped, instead of failing on a missing regex match.
version_config = {
    match.group(1): match.group(2)
    for match in (
        re.match(r"^(.+)\s+=\s+(.+)$", line)
        for line in git_output(f"show {source_remote}/main:version.config").splitlines()
    )
    if match
}

# The target branch name is the previous version with its last component
# replaced by "x".
previous_version = version_config["update_from_version"]
previous_version_parts = previous_version.split(".")
previous_version_parts[-1] = "x"
backport_target = ".".join(previous_version_parts)

# Label put on the PRs whose commits are already present in the target branch.
backported_label = f"backported-{backport_target}"

print(f"Will backport to {backport_target}.")
# Fetch the target branch into its remote-tracking ref. Apparently the local
# repo can be shallow in some cases in Github Actions, so the depth is specified
# explicitly. --unshallow is not used here because it complains on normal
# (non-shallow) repositories.
git_check(
    f"fetch --quiet --depth={HISTORY_DEPTH} {target_remote} {backport_target}:refs/remotes/{target_remote}/{backport_target}"
)
# Also fetch all branches from the target repository, because we use the presence
# of the backport branches to determine that a backport exists. It's not convenient
# to query for branch existence through the PyGithub API.
git_check(f"fetch {target_remote}")
# Find out which commits are unique to main and target branch. Also build sets of
# the titles of these commits. We will compare the titles to check whether a
# commit was backported.
# Each log line is "<abbreviated hash>\t<subject>". It is split on the first tab
# only, so that a subject that itself contains a tab still gives exactly two
# fields.
main_commits = [
    line.split("\t", 1)
    for line in git_output(
        f'log -{HISTORY_DEPTH} --pretty="format:%h\t%s" {target_remote}/{backport_target}..{source_remote}/main'
    ).splitlines()
    if line
]

print(f"Have {len(main_commits)} new commits in the main branch.")

branch_commits = [
    line.split("\t", 1)
    for line in git_output(
        f'log -{HISTORY_DEPTH} --pretty="format:%h\t%s" {source_remote}/main..{target_remote}/{backport_target}'
    ).splitlines()
    if line
]
branch_commit_titles = {title for _, title in branch_commits}
# We will do backports per-PR, because one PR, though not often, might contain
# many commits. So as the first step, go through the commits unique to main, find
# out which of them have to be backported, and remember the corresponding PRs.
# We also have to remember which commits to backport. The list from the PR itself
# is not what we need: these are the original commits from the PR branch, and we
# need the resulting commits in main.
class PRInfo:
    """A pull request selected for backporting, with what is needed to do it."""

    def __init__(self, pygithub_pr_, issue_number_):
        # The pull request object and the number of the issue it closes, as
        # given by the caller.
        self.pygithub_pr, self.issue_number = pygithub_pr_, issue_number_
        # Starts empty and is filled in by the caller with the commits to
        # backport.
        self.pygithub_commits = list()
def should_backport_by_labels(number, title, labels):
    """Decide from its labels whether a PR/issue should be backported.

    `labels` is a set of label names; `number` and `title` are only used in
    the printed explanation.

    The answer is ternary:
    False -- a label forbids the backport (such labels take precedence),
    True -- a label requests the backport,
    None -- no label says anything either way (a weak "no").
    """
    blocking = labels.intersection(
        ("disable-auto-backport", "auto-backport-not-done")
    )
    if blocking:
        shown_label = next(iter(blocking))
        print(
            f"#{number} '{title}' is labeled as '{shown_label}' which prevents automated backporting."
        )
        return False

    requesting = labels.intersection(
        ("bug", "force-auto-backport", "force-auto-backport-workflow")
    )
    if not requesting:
        return None

    shown_label = next(iter(requesting))
    print(
        f"#{number} '{title}' is labeled as '{shown_label}' which requests automated backporting."
    )
    return True
# Go through the commits unique to main, and build a dict(pr number -> PRInfo)
# of PRs that we will consider for backporting.
prs_to_backport = {}
for commit_sha, commit_title in main_commits:
    print()

    pygithub_commit = source_repo.get_commit(sha=commit_sha)

    pulls = pygithub_commit.get_pulls()
    if not pulls or pulls.totalCount == 0:
        print(f"{commit_sha[:9]} '{commit_title}' does not belong to a PR.")
        continue

    if pulls.totalCount > 1:
        # What would that mean? Just play it safe and skip it. The PR numbers
        # are integers, so they have to be converted before joining.
        print(
            f"{commit_sha[:9]} '{commit_title}' references multiple PRs: {', '.join(str(candidate.number) for candidate in pulls)}"
        )
        continue

    pull = pulls[0]
    pull_labels = {label.name for label in pull.labels}

    # If a commit with the same title is already in the branch, mark the PR with
    # a corresponding tag. This makes it easier to check what was backported
    # when looking at the release milestone. Note that we do this before other
    # checks -- maybe it was backported manually regardless of the usual
    # conditions.
    if commit_title in branch_commit_titles:
        print(f"{commit_sha[:9]} '{commit_title}' is already in the branch.")

        if backported_label not in pull_labels:
            pull.add_to_labels(backported_label)

        continue

    # Next, we're going to look at the labels of both the PR and the linked
    # issue, if any, to understand whether we should backport the fix. We have
    # labels to request backport like "bug", and labels to prevent backport
    # like "disable-auto-backport", on both issue and the PR. We're going to use
    # the ternary False/None/True logic to combine them properly.
    issue_number, issue_title, issue_labels = get_referenced_issue(pull.number)
    if not issue_number:
        should_backport_issue_ternary = None
        print(
            f"{commit_sha[:9]} belongs to the PR #{pull.number} '{pull.title}' that does not close an issue."
        )
    else:
        should_backport_issue_ternary = should_backport_by_labels(
            issue_number, issue_title, issue_labels
        )
        # The number and title were already returned by the query above, so
        # there is no need to request the issue again just to print them.
        print(
            f"{commit_sha[:9]} belongs to the PR #{pull.number} '{pull.title}' "
            f"that references the issue #{issue_number} '{issue_title}'."
        )

    should_backport_pr_ternary = should_backport_by_labels(
        pull.number, pull.title, pull_labels
    )

    # We backport if either the PR or the issue labels request the backport, and
    # none of them prevent it. The comparisons use `is` on purpose: None (no
    # opinion) must not be confused with False (backport is prevented).
    backport_requested = (
        should_backport_pr_ternary is True or should_backport_issue_ternary is True
    )
    backport_prevented = (
        should_backport_pr_ternary is False or should_backport_issue_ternary is False
    )
    if not backport_requested or backport_prevented:
        continue

    print(f"{commit_sha[:9]} '{commit_title}' will be considered for backporting.")

    # Remember the PR and the corresponding resulting commit in main.
    if pull.number not in prs_to_backport:
        prs_to_backport[pull.number] = PRInfo(pull, issue_number)

    # We're traversing the history backwards, and want to have the list of
    # commits in forward order.
    prs_to_backport[pull.number].pygithub_commits.insert(0, pygithub_commit)
def report_backport_not_done(original_pr, reason, details=None):
    """Tell the original PR that it could not be backported automatically.

    Posts a comment with the reason (and the optional details) on the
    original PR, then labels the PR "auto-backport-not-done" so that it is
    not picked up for backporting again.
    """
    print(
        f"Will not backport the PR #{original_pr.number} '{original_pr.title}': {reason}"
    )

    # The comment consists of paragraphs separated by an empty line.
    paragraphs = [f"Automated backport to {backport_target} not done: {reason}."]

    if details:
        paragraphs.append(f"{details}")

    # The job log link only makes sense in the Github Action environment.
    if "GITHUB_REPOSITORY" in os.environ:
        paragraphs.append(
            f"[Job log](https://github.com/{os.environ.get('GITHUB_REPOSITORY')}"
            f"/actions/runs/{os.environ.get('GITHUB_RUN_ID')}"
            f"/attempts/{os.environ.get('GITHUB_RUN_ATTEMPT')})"
        )

    original_pr.create_issue_comment("\n\n".join(paragraphs))
    original_pr.add_to_labels("auto-backport-not-done")
# Now, go over the list of PRs that we have collected, and try to backport
# each of them.
print(f"Have {len(prs_to_backport)} PRs to backport.")
for index, pr_info in enumerate(prs_to_backport.values()):
    print()

    # Don't want to have an endless loop that modifies the repository in an
    # unattended script. The already backported/conflicted PRs shouldn't even
    # get into this list, so the low number is OK, it will still make progress.
    if index > 5:
        print(f"{index} PRs processed, stopping as a precaution.")
        sys.exit(0)

    original_pr = pr_info.pygithub_pr
    backport_branch = f"backport/{backport_target}/{original_pr.number}"

    # If there is already a backport branch for this PR, this probably means
    # that we already created the backport PR. Skip it.
    if (
        git_returncode(f"rev-parse {target_remote}/{backport_branch} > /dev/null 2>&1")
        == 0
    ):
        print(
            f'Backport branch {backport_branch} for PR #{original_pr.number}: "{original_pr.title}" already exists. Skipping.'
        )
        continue

    # Try to cherry-pick the commits on top of the target branch, working on a
    # detached HEAD.
    git_check(
        f"checkout --quiet --detach {target_remote}/{backport_target} > /dev/null"
    )

    commit_shas = [commit.sha for commit in pr_info.pygithub_commits]
    if git_returncode(f"cherry-pick --quiet -m 1 -x {' '.join(commit_shas)}") != 0:
        # Capture the status before aborting, so that the report shows the
        # conflicting files.
        details = f"### Git status\n\n```\n{git_output('status')}\n```"
        git_check("cherry-pick --abort")
        report_backport_not_done(original_pr, "cherry-pick failed", details)
        continue

    # We don't have the permission to modify workflows, so by default a PR that
    # touches them is not backported. The `force-auto-backport-workflow` label
    # on the PR overrides this.
    changed_files = {file.filename for file in original_pr.get_files()}
    changed_workflow_files = {
        filename
        for filename in changed_files
        if filename.startswith(".github/workflows/")
    }
    if changed_workflow_files:
        pull_labels = {label.name for label in original_pr.labels}
        force_workflow_label = pull_labels.intersection(
            ["force-auto-backport-workflow"]
        )
        if not force_workflow_label:
            # Sorted, so that the same file is named on every run.
            details = (
                f"The PR touches a workflow file '{sorted(changed_workflow_files)[0]}' "
                "and cannot be backported automatically"
            )
            report_backport_not_done(original_pr, "backport failed", details)
            continue

        print(
            f"PR #{original_pr.number} '{original_pr.title}' touches a workflow file, but will be backported anyway."
        )

    # Push the backport branch. `@` is the current (detached) HEAD with the
    # cherry-picked commits.
    git_check(f"push --quiet {target_remote} @:refs/heads/{backport_branch}")

    # Prepare description for the backport PR.
    backport_description = (
        f"This is an automated backport of #{original_pr.number}: {original_pr.title}."
    )

    if pr_info.issue_number:
        backport_description += f"\nThe original issue is #{pr_info.issue_number}."

    # Do not merge the PR automatically if it changes some particularly
    # conflict-prone files that are better to review manually. Also mention this
    # in the description.
    stopper_files = changed_files.intersection(
        ["sql/updates/latest-dev.sql", "sql/updates/reverse-dev.sql"]
    )
    if stopper_files:
        backport_description += (
            "\n"
            f"This PR will not be merged automatically, because it modifies '{sorted(stopper_files)[0]}' "
            "which is conflict-prone. Please review these changes manually."
        )
    else:
        backport_description += (
            "\n"
            "This PR will be merged automatically after all the relevant CI checks pass."
        )

    backport_description += (
        " If this fix should not be backported, or will be backported manually, "
        "just close this PR. You can use the backport branch to add your "
        "changes, it won't be modified automatically anymore."
        "\n"
        "\n"
        "For more details, please see the [documentation]"
        "(https://github.com/timescale/eng-database/wiki/Releasing-TimescaleDB#automated-cherry-picking-of-bug-fixes)"
    )

    # Add original PR description. Comment out the Github issue reference
    # keywords like 'Fixes #1234', to avoid having multiple PRs saying they fix
    # a given issue. The backport PR already mentions the fixed issue as
    # "The original issue is #xxxx". The body of a PR with an empty description
    # is None, which re.sub does not accept, so treat it as an empty string.
    original_description = re.sub(
        r"((fix|clos|resolv)[esd]+)(\s+#[0-9]+)",
        r"`\1`\3",
        original_pr.body or "",
        flags=re.IGNORECASE,
    )
    backport_description += (
        "\n"
        "\n"
        "## Original description"
        "\n"
        f"### {original_pr.title}"
        "\n"
        f"{original_description}"
    )

    # Create the backport PR.
    backport_pr = target_repo.create_pull(
        title=f"Backport to {backport_target}: #{original_pr.number}: {original_pr.title}",
        body=backport_description,
        head=backport_branch,
        base=backport_target,
    )
    backport_pr.add_to_labels("is-auto-backport")
    backport_pr.add_to_assignees(original_pr.user.login)

    # The PRs that touch the conflict-prone files are left for a manual merge.
    if not stopper_files:
        set_auto_merge(backport_pr.number)

    print(
        f"Created backport PR #{backport_pr.number} for #{original_pr.number}: {original_pr.title}"
    )