#!/usr/bin/perl
#
# Converts Wesnoth copyrights.csv to debian/copyright
# Dependencies: Text-CSV_XS
#
# Copyright (C) 2024  P. J. McDermott
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

use English qw{-no_match_vars};
use strict;
use warnings;

use Text::Wrap;  # In Perl core
if (!eval('use Text::CSV_XS qw{csv}; 1;')) {
	STDERR->print("debian/csv2dep5.pl requires the " .
		"libtext-csv-xs-perl package\n");
	exit(1);
}

# Update this Wesnoth CSV => DEP5/SPDX map as valid licenses are found.
my %LICENSES = (
	'CC BY-SA 4.0' => 'CC-BY-SA-4',
	'CC0'          => 'CC0-1',
	'GNU GPL v2+'  => 'GPL-2+',
);

# Read "debian/copyright".
sub read_dep5
{
	my ($d_c_files, $d_c_licenses, $licenses) = @_;
	my $d_c_fh;
	my $d_c;

	# Read copyright file generated by branchcheck.
	if (!open($d_c_fh, '<:encoding(UTF-8)', 'debian/copyright')) {
		STDERR->print("Failed to open debian/copyright for reading\n");
		return;
	}
	{
		local $INPUT_RECORD_SEPARATOR = undef;
		$d_c = readline($d_c_fh);
	}
	if (!close($d_c_fh)) {
		STDERR->print("Failed to close debian/copyright\n");
		return;
	}

	# Split "Files:" stanzas from "License:" stanzas, so we can inject
	# between them.
	(${$d_c_files}, ${$d_c_licenses}) = $d_c =~ m{
		\A
		(
			.*          # Match anything ...
			?           # ... but not "License:" (don't be greedy)
			\n          # Line break 1
		)
		(
			\n          # Line break 2
			License:.*  # First license stanza
		)
		\z
	}msx;

	# Find existing license stanzas so we can warn about missing ones later.
	foreach my $line (split(qr{\n}msx, ${$d_c_licenses})) {
		if ($line =~ m{\ALicense:}msx) {
			$line =~ s{\ALicense:\s*}{}msx;
			$licenses->{$line} = 1;
		}
	}

	return 1;
}

# Read "copyrights.csv".
sub read_csv
{
	my ($stanzas, $notes, $dir_refs, $licenses, $missing_license_names) =
		@_;
	my $copyrights;
	my $node;

	# Parse copyrights.csv.
	if (!($copyrights = csv('in' => 'copyrights.csv', 'headers' => 'skip',
				'encoding' => 'UTF-8'))) {
		STDERR->printf("Failed to parse copyrights.csv: %s\n",
			Text::CSV_XS->error_diag());
		return;
	}
	# 1.17.25 (commit e07acf71dc9) ensures sorting by filename, but be
	# paranoid.
	$copyrights = [sort({$a->[1] cmp $b->[1]} @{$copyrights})];

	# Process rows.
	foreach my $row (@{$copyrights}) {
		my ($date, $file, $license, $author, $note, $update, $md5) =
			@{$row};

		# attic/ files aren't in upstream tar archives.
		if ($file =~ m{^attic/}) {
			next;
		}

		# Sane are we?
		if (!defined($LICENSES{$license})) {
			# Check for nonsense like "GNU GPL v2+;CC BY-SA 4.0"
			# (a GPL-2+ # file with CC-BY-SA-4 modifications isn't
			# legally possible).  A forwarded patch should catch
			# this:
			#     https://github.com/wesnoth/wesnoth/pull/8254
			#     https://github.com/wesnoth/wesnoth/commit/4c87a4be
			# The whole point of this conversion is to group files
			# into stanzas by license and notes, so without a valid
			# license field, we have nothing to do.  Also, there
			# might be a legal issue with the package.
			STDERR->printf("Unknown or invalid license \"%s\"\n",
				$license);
			return;
		}
		$license = $LICENSES{$license};
		if (!defined($licenses->{$license})) {
			# No matching "License:" stanza in debian/copyright.
			# Remind later.
			$missing_license_names->{$license} = 1;
		}
		if ($update =~ m{\A\d{4}/\d{2}/\d{2}\z}msx) {
			# update_copyrights sets the "Needs Update" field to
			# a new date if a file's MD5 hash changes.  Sometimes
			# people miss this, e.g. in upstream commit 1ecd4f4d599c
			# (pull #8195).  A forwarded patch should catch this:
			#     https://github.com/wesnoth/wesnoth/pull/8254
			#     https://github.com/wesnoth/wesnoth/commit/efad8a0f
			$date = $update;
			$update = '';
		} elsif ($update ne '') {
			# People sometimes want to use a comma in the "Notes"
			# field and instead overflow into the "Needs Update"
			# field, e.g. in upstream commit 97c8feb8ca3b (pull
			# #7903).  A forwarded patch should catch this:
			#     https://github.com/wesnoth/wesnoth/pull/8254
			#     https://github.com/wesnoth/wesnoth/commit/efad8a0f
			$note .= ',' . $update;
		}

		# Build list of stanzas with common "License:" and "Comment:"
		# fields.
		if (!defined($stanzas->{$note})) {
			$stanzas->{$note} = {};
			if ($note ne '') {
				# Keep notes sorted by filename.
				push(@{$notes}, $note);
			}
		}
		if (!defined($stanzas->{$note}->{$license})) {
			$stanzas->{$note}->{$license} = [];
		}
		push(@{$stanzas->{$note}->{$license}}, [$file, $author]);

		# Reference count directories.  Build a hash tree like:
		# {
		#     'attic' => {                       # Directory entry name
		#         '' => 472,                     # Refcnt: files in tree
		#         '/GPL-2+' => { '' => 1 },      # License => Notes => 1
		#         'character-box.png => {        # Directory entry name
		#             '' => 1,                   # Refcnt: files in tree
		#             '/GPL-2+' => { '' => 1 },  # License => Notes => 1
		#         },
		#         ...
		#     },
		#     ....
		# }
		$node = $dir_refs;  # Root directory
		# Protect against files with names like "GPL-2+".  File names
		# can't contain "/", so use it in license names.
		$license = '/' . $license;
		foreach my $dirent (split(qr{/}msx, $file)) {
			if (!defined($node->{$dirent})) {
				# Dirent must be created.
				$node->{$dirent} = {};
				$node->{$dirent}->{''} = 0;  # Reference count
			}
			$node = $node->{$dirent};
			if (!defined($node->{$license})) {
				$node->{$license} = {};
			}
			if (!defined($node->{$license}->{$note})) {
				# Dirent already exists, but stanza hasn't been
				# counted.
				$node->{$license}->{$note} = 1;
				++$node->{''};  # Reference count
			}
		}
	}

	return 1;
}

# After listing a directory wildcard, clear reference counts of all nodes in the
# subtree so files in the subtree aren't listed separately.
sub clear_subtree
{
	my ($subtree) = @_;

	$subtree->{''} = 0;

	foreach my $node (grep(m{\A[^/]}msx, keys(%{$subtree}))) {
		clear_subtree($subtree->{$node});
	}

	return;
}

# Write a files stanza.
sub write_stanza
{
	my ($fh, $license, $notes, $rows, $dir_refs) = @_;
	my $path;
	my $node;
	my %authors;

	$fh->print("\nFiles:\n");
	foreach my $row (@{$rows}) {
		$path = undef;
		$node = $dir_refs;
		foreach my $dirent (split(qr{/}msx, $row->[0])) {
			if (!defined($path)) {
				$path = $dirent;
			} else {
				$path .= '/' . $dirent;
			}
			$node = $node->{$dirent};
			if ($node->{''} == 1) {
				# Reference count + license=>notes == 2
				if (scalar(%{$node}) > 2) {
					$path .= '/*';
				}
				$fh->printf(" %s\n", $path);
				# Don't repeat this subtree.
				clear_subtree($node);
				last;
			}
		}
	}
	$fh->print("Copyright:\n");
	foreach my $row (@{$rows}) {
		foreach my $author (split(qr{;}msx, $row->[1])) {
			$authors{$author} = 1;
		}
	}
	$fh->printf("  %s\n", join("\n  ", sort(keys(%authors))));
	$fh->printf("License: %s\n", $license);
	if ($notes ne '') {
		## no critic (Variables::ProhibitPackageVars)
		$Text::Wrap::huge = 'overflow';
		## no critic (ValuesAndExpressions::ProhibitMagicNumbers)
		# 79-column lines (wrap-and-sort default since 2.14.6, #756067)
		$Text::Wrap::columns = 79 + 1;
		## use critic (ValuesAndExpressions::ProhibitMagicNumbers)
		## use critic (Variables::ProhibitPackageVars)
		$fh->print("Comments:\n" . wrap(' ', ' ', $notes) . "\n");
	}

	return;
}

# Write "debian/copyright".
## no critic (Subroutines::ProhibitManyArgs)
sub write_dep5
## use critic (Subroutines::ProhibitManyArgs)
{
	my ($d_c_files, $d_c_licenses,
		$stanzas, $notes, $dir_refs, $missing_license_names) = @_;
	my $d_c_fh;

	# Open and write old files stanzas.
	## no critic (InputOutput::RequireBriefOpen)
	if (!open($d_c_fh, '>:encoding(UTF-8)', 'debian/copyright~')) {
	## use critic (InputOutput::RequireBriefOpen)
		STDERR->print("Failed to open debian/copyright~ for writing\n");
		return;
	}
	$d_c_fh->print(${$d_c_files});

	# Write new files stanzas.
	foreach my $license (sort(keys(%{$stanzas->{''}}))) {
		write_stanza($d_c_fh, $license, '', $stanzas->{''}->{$license},
			$dir_refs);
	}
	foreach my $note (@{$notes}) {
		foreach my $license (sort(keys(%{$stanzas->{$note}}))) {
			write_stanza($d_c_fh, $license, $note,
				$stanzas->{$note}->{$license}, $dir_refs);
		}
	}

	# Write old license stanzas, close, and rename.
	$d_c_fh->print(${$d_c_licenses});
	if (!close($d_c_fh)) {
		STDERR->print("Failed to close debian/copyright\n");
		return;
	}
	if (!rename('debian/copyright~', 'debian/copyright')) {
		STDERR->print("Failed to rename debian/copyright~\n");
		return;
	}

	# Warn of missing license stanzas for new files.
	if (scalar(keys(%{$missing_license_names}))) {
		STDERR->print(
			"debian/copyright missing stanzas for licenses:\n  ");
		STDERR->print(join("\n  ",
				sort(keys(%{$missing_license_names}))) . "\n");
	}

	return 1;
}

sub main
{
	my $d_c_files;
	my $d_c_licenses;
	my %licenses;
	my %stanzas;
	my @notes;
	my %dir_refs;
	my %missing_license_names;

	read_dep5(\$d_c_files, \$d_c_licenses, \%licenses)
		or return 1;

	# packaging/org.wesnoth.Wesnoth.appdata.xml
	$dir_refs{'packaging'} = {'' => 1};
	# utils/autorevision.sh, utils/tagfind, utils/wxdiff
	$dir_refs{'utils'} = {'' => 1};
	read_csv(\%stanzas, \@notes, \%dir_refs, \%licenses,
		\%missing_license_names)
		or return 1;

	write_dep5(\$d_c_files, \$d_c_licenses,
		\%stanzas, \@notes, \%dir_refs, \%missing_license_names)
		or return 1;

	return 0;
}

exit(main());
