#!/usr/bin/env python3
# encoding: utf-8
##
# This script checks all sound/music files in the repository for whether they've been
# modified or added without updating the file tracking sound or music copyright.
# This check was used in continuous integration for image files as well prior to September 2024.
##

import argparse
import contextlib
import csv
import hashlib
from operator import itemgetter
import os
from pathlib import Path
from subprocess import check_output
import sys

##
# csv file layout:
# [0] = current git commit date
# [1] = file path, relative to the repository root
# [2] = license name(s)
# [3] = authorship information
# [4] = notes
# [5] = new git commit date, if different from the value in [0]
# [6] = current md5 hash
##

##
# Add new licenses to this list:
# Avoid things like "GNU GPL v2+;CC BY-SA 4.0", unless you mean to dual license
# under either GNU GPL v2+ or CC BY-SA 4.0. GNU GPL v2+ and CC BY-SA 4.0 (e.g.
# a GNU GPL v2+ file with CC BY-SA 4.0 modifications) isn't legally possible.
##
known_licenses = (
    "CC BY-SA 4.0",
    "CC0",
    "GNU GPL v2+",
)

# File suffixes that are checked against the copyright CSV.
SOUND_SUFFIXES = (".wav", ".ogg")

# Number of columns a well-formed CSV row has (see the layout above).
EXPECTED_FIELDS = 7

# Number of bytes read per chunk while hashing a file.
HASH_CHUNK_SIZE = 65536


def do_git(file):
    """Return the date of the last git commit touching *file* as YYYY/MM/DD.

    The result is an empty string when git prints nothing for the path.
    Raises whatever ``check_output`` raises if git cannot be run or fails.
    """
    # "--" ends option parsing, so a path that starts with "-" or that happens
    # to match a branch/tag name is still treated as a path.
    output = check_output(
        ["git", "log", "-1", "--format=%ad", "--date=format:%Y/%m/%d", "--", file]
    )
    return str(output, 'UTF-8').rstrip('\n')


def do_hash(file):
    """Return the hexadecimal md5 digest of the contents of *file*.

    The file is read in fixed-size chunks so large files are never loaded
    into memory at once.
    """
    md5 = hashlib.md5()
    with open(file, 'rb') as f:
        # iter() with a sentinel stops at the first empty read (end of file).
        for data in iter(lambda: f.read(HASH_CHUNK_SIZE), b""):
            md5.update(data)
    return str(md5.hexdigest())


def _row_path(row):
    """Return the file path column of *row*, or "" if the row is too short to have one."""
    return row[1] if len(row) > 1 else ""


def main(argv=None):
    """Run the copyright check and return the process exit code.

    *argv* is the list of command line arguments to parse; ``None`` means
    ``sys.argv[1:]``. The working directory is changed to the ``--repo``
    directory, so ``--input`` and ``--output`` are resolved relative to it.

    Returns 1 if any check failed (removed, new, changed, incomplete or
    malformed entries, or unknown licenses), otherwise 0.
    """
    args = argparse.ArgumentParser()
    args.add_argument("--repo", default=".", help="The directory of the Wesnoth repository to run this script against.")
    args.add_argument("--output", default="copyrights.csv", help="The file to write the results of this script to.")
    args.add_argument("--input", default="copyrights.csv", help="The file to read the existing copyright data from.")
    options = args.parse_args(argv)

    os.chdir(options.repo)

    csv_data = {}
    # Too few fields
    missing_fields = []
    # Too many fields, possibly due to an unquoted comma
    extra_fields = []
    # New files
    added = []
    # Changed files
    changed = []
    # Already mentioned in the CSV file, but lacking something in either the license or author fields
    incomplete = []
    # Already mentioned in the CSV file, but have something in the needs update field
    update = []
    unchanged = []
    removed = []
    # Sanity-check for known licenses
    unknown_licenses = []

    with open(options.input, encoding="utf-8") as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            if row[0] == "Date":
                # Header row.
                continue
            if len(row) < 2:
                # No file path column at all: report it instead of crashing.
                missing_fields.append(row)
                continue

            file = row[1]
            if not os.path.exists(file):
                removed.append(file)
                continue

            csv_data[file] = row

    with contextlib.suppress(FileNotFoundError):
        os.remove(options.output)

    # The working directory is already the repository root, so walk "." rather
    # than options.repo: this keeps every path relative to the repository root
    # (matching the CSV) no matter how --repo was spelled.
    for root, _, files in os.walk("."):
        for filename in files:
            if Path(filename).suffix not in SOUND_SUFFIXES:
                continue

            file_path = os.path.normpath(os.path.join(root, filename))
            if os.path.sep != '/':
                # Always use slashes for the file path irrespective of OS used to run the update
                file_path = file_path.replace(os.path.sep, '/')

            file_hash = do_hash(file_path)
            row = csv_data.get(file_path)

            if row is None:
                added.append(["", file_path, "", "", "", do_git(file_path), file_hash])
            elif len(row) < EXPECTED_FIELDS:
                missing_fields.append(row)
            elif len(row) > EXPECTED_FIELDS:
                extra_fields.append(row)
            elif row[5] != "":
                update.append(row)
            elif row[6] != file_hash:
                row[5] = do_git(file_path)
                row[6] = file_hash
                changed.append(row)
            elif row[2].strip() == "" or row[3].strip() == "":
                incomplete.append(row)
            elif row[2] not in known_licenses:
                unknown_licenses.append(row[2])
                incomplete.append(row)
            else:
                unchanged.append(row)

    final_output = missing_fields + extra_fields + added + changed + incomplete + update + unchanged
    # Rows without a path column sort first instead of raising IndexError.
    final_output.sort(key=_row_path)

    if options.output != "":
        with open(options.output, 'w', encoding="utf-8") as f:
            writer = csv.writer(f, lineterminator="\n")
            writer.writerow(["Date", "File", "License", "Author - Real Name(other name);Real Name(other name);etc", "Notes", "Needs Update", "MD5"])
            writer.writerows(final_output)
    else:
        writer = csv.writer(sys.stdout, lineterminator="\n")
        writer.writerows(final_output)

    any_check_failed = False

    if removed:
        any_check_failed = True
        print("There are "+str(len(removed))+" removed files")

    count_missing_fields = len(missing_fields)
    count_extra_fields = len(extra_fields)
    count_added = len(added)
    count_changed = len(changed)
    count_incomplete = len(incomplete)
    count_update = len(update)

    if missing_fields or extra_fields or added or changed or incomplete or update:
        any_check_failed = True
        print("\nThere are "+str(count_missing_fields)+" rows with too few fields:\n"+"\n".join(",".join(a) for a in missing_fields))
        print("\nThere are "+str(count_extra_fields)+" rows with too many fields, possibly due to an unquoted comma:\n"+"\n".join(",".join(a) for a in extra_fields))
        print("\nThere are "+str(count_added)+" new files:\n"+"\n".join(a[1] for a in added))
        print("\nThere are "+str(count_changed)+" changed files:\n"+"\n".join(a[1] for a in changed))
        print("\nThere are "+str(count_incomplete)+" files that lack license or author information:\n"+"\n".join(a[1] for a in incomplete))
        print("\nThere are "+str(count_update)+" files that need updated information:\n"+"\n".join(a[1] for a in update))

    if unknown_licenses:
        any_check_failed = True
        print("Unknown licenses:")
        print(" " + "\n ".join(unknown_licenses))

    return 1 if any_check_failed else 0


if __name__ == "__main__":
    sys.exit(main())