From 61a3ceaf9ab8f3a018d07040294e8ccbad5f384a Mon Sep 17 00:00:00 2001
From: John McNamara <jmcnamara@cpan.org>
Date: Thu, 27 Mar 2025 17:58:06 +0000
Subject: [PATCH] test: update python test runner

---
 test/functional/helper_functions.py | 146 ++++++++++++++++++----------
 1 file changed, 93 insertions(+), 53 deletions(-)

diff --git a/test/functional/helper_functions.py b/test/functional/helper_functions.py
index 2cf7da84..8ee5e3c7 100644
--- a/test/functional/helper_functions.py
+++ b/test/functional/helper_functions.py
@@ -6,29 +6,26 @@
 # Copyright 2014-2025, John McNamara, jmcnamara@cpan.org.
 #
 
-import re
-import sys
 import os.path
-from zipfile import ZipFile
-from zipfile import BadZipfile
-from zipfile import LargeZipFile
+import re
+from zipfile import BadZipFile, LargeZipFile, ZipFile
 
 
 def _xml_to_list(xml_str):
     # Convert test generated XML strings into lists for comparison testing.
 
     # Split the XML string at tag boundaries.
-    parser = re.compile(r'>\s*<')
+    parser = re.compile(r">\s*<")
     elements = parser.split(xml_str.strip())
 
     elements = [s.replace("\r", "") for s in elements]
 
     # Add back the removed brackets.
     for index, element in enumerate(elements):
-        if not element[0] == '<':
-            elements[index] = '<' + elements[index]
-        if not element[-1] == '>':
-            elements[index] = elements[index] + '>'
+        if not element.startswith("<"):  # startswith/endswith tolerate "".
+            elements[index] = "<" + elements[index]
+        if not element.endswith(">"):
+            elements[index] = elements[index] + ">"
 
     return elements
 
@@ -43,7 +40,7 @@ def _vml_to_list(vml_str):
     vml_str = vml_str.replace("\r", "")
 
     vml = vml_str.split("\n")
-    vml_str = ''
+    vml_str = ""
 
     for line in vml:
         # Skip blank lines.
@@ -61,11 +58,11 @@ def _vml_to_list(vml_str):
             line += " "
 
         # Add newline after element end.
-        if re.search('>$', line):
+        if re.search(">$", line):
             line += "\n"
 
         # Split multiple elements.
-        line = line.replace('><', ">\n<")
+        line = line.replace("><", ">\n<")
 
         # Put all of Anchor on one line.
         if line == "<x:Anchor>\n":
@@ -109,28 +106,28 @@ def _compare_xlsx_files(got_file, exp_file, ignore_files, ignore_elements):
     # XML file into an list of XML elements.
     try:
         # Open the XlsxWriter as a zip file for testing.
-        got_zip = ZipFile(got_file, 'r')
-    except IOError:
-        # For Python 2.5+ compatibility.
-        e = sys.exc_info()[1]
+        got_zip = ZipFile(got_file, "r")
+    except IOError as e:
+        # Errors are returned to the caller as an (error, "") pair rather
+        # than raised.
         error = "XlsxWriter file error: " + str(e)
-        return error, ''
-    except (BadZipfile, LargeZipFile):
-        e = sys.exc_info()[1]
-        error = "XlsxWriter zipfile error, '" + exp_file + "': " + str(e)
+        return error, ""
+    except (BadZipFile, LargeZipFile) as e:
+        # The archive being opened in this block is got_file, so name it.
+        error = "XlsxWriter zipfile error, '" + got_file + "': " + str(e)
-        return error, ''
+        return error, ""
 
     try:
         # Open the Excel as a zip file for testing.
-        exp_zip = ZipFile(exp_file, 'r')
-    except IOError:
-        e = sys.exc_info()[1]
+        exp_zip = ZipFile(exp_file, "r")
+    except IOError as e:
+        # Errors are returned as an (error, "") pair rather than raised.
         error = "Excel file error: " + str(e)
-        return error, ''
-    except (BadZipfile, LargeZipFile):
-        e = sys.exc_info()[1]
+        return error, ""
+    except (BadZipFile, LargeZipFile) as e:
+        # Errors are returned as an (error, "") pair rather than raised.
         error = "Excel zipfile error, '" + exp_file + "': " + str(e)
-        return error, ''
+        return error, ""
 
     # Get the filenames from the zip files.
     got_files = sorted(got_zip.namelist())
@@ -146,57 +143,58 @@ def _compare_xlsx_files(got_file, exp_file, ignore_files, ignore_elements):
 
     # Compare each file in the XLSX containers.
     for filename in exp_files:
-
         got_xml_str = got_zip.read(filename)
         exp_xml_str = exp_zip.read(filename)
 
         # Compare binary files with string comparison based on extension.
         extension = os.path.splitext(filename)[1]
-        if extension in ('.png', '.jpeg', '.gif','.bmp', '.bin'):
+        if extension in (".png", ".jpeg", ".gif", ".bmp", ".wmf", ".emf", ".bin"):
             if got_xml_str != exp_xml_str:
-                return 'got: %s' % filename, 'exp: %s' % filename
+                return f"got: {filename}", f"exp: {filename}"
             continue
 
-        if sys.version_info >= (3, 0, 0):
-            got_xml_str = got_xml_str.decode('utf-8')
-            exp_xml_str = exp_xml_str.decode('utf-8')
+        got_xml_str = got_xml_str.decode("utf-8")
+        exp_xml_str = exp_xml_str.decode("utf-8")
+
+        # Check for errant xml tags in the generated file.
+        if "<<" in got_xml_str:
+            return f"Double start tag in XlsxWriter file {filename}", ""
 
         # Remove dates and user specific data from the core.xml data.
-        if filename == 'docProps/core.xml':
-            exp_xml_str = re.sub(r' ?John', '', exp_xml_str)
-            exp_xml_str = re.sub(r'\d\d\d\d-\d\d-\d\dT\d\d\:\d\d:\d\dZ',
-                                 '', exp_xml_str)
-            got_xml_str = re.sub(r'\d\d\d\d-\d\d-\d\dT\d\d\:\d\d:\d\dZ',
-                                 '', got_xml_str)
+        if filename == "docProps/core.xml":
+            exp_xml_str = re.sub(r" ?John", "", exp_xml_str)
+            exp_xml_str = re.sub(
+                r"\d\d\d\d-\d\d-\d\dT\d\d\:\d\d:\d\dZ", "", exp_xml_str
+            )
+            got_xml_str = re.sub(
+                r"\d\d\d\d-\d\d-\d\dT\d\d\:\d\d:\d\dZ", "", got_xml_str
+            )
 
         # Remove workbookView dimensions which are almost always different
         # and calcPr which can have different Excel version ids.
-        if filename == 'xl/workbook.xml':
-            exp_xml_str = re.sub(r'<workbookView[^>]*>',
-                                 '<workbookView/>', exp_xml_str)
-            got_xml_str = re.sub(r'<workbookView[^>]*>',
-                                 '<workbookView/>', got_xml_str)
-            exp_xml_str = re.sub(r'<calcPr[^>]*>',
-                                 '<calcPr/>', exp_xml_str)
-            got_xml_str = re.sub(r'<calcPr[^>]*>',
-                                 '<calcPr/>', got_xml_str)
+        if filename == "xl/workbook.xml":
+            exp_xml_str = re.sub(r"<workbookView[^>]*>", "<workbookView/>", exp_xml_str)
+            got_xml_str = re.sub(r"<workbookView[^>]*>", "<workbookView/>", got_xml_str)
+            exp_xml_str = re.sub(r"<calcPr[^>]*>", "<calcPr/>", exp_xml_str)
+            got_xml_str = re.sub(r"<calcPr[^>]*>", "<calcPr/>", got_xml_str)
 
         # Remove printer specific settings from Worksheet pageSetup elements.
-        if re.match(r'xl/worksheets/sheet\d.xml', filename):
-            exp_xml_str = re.sub(r'horizontalDpi="200" ', '', exp_xml_str)
-            exp_xml_str = re.sub(r'verticalDpi="200" ', '', exp_xml_str)
-            exp_xml_str = re.sub(r'(<pageSetup[^>]*) r:id="rId1"',
-                                 r'\1', exp_xml_str)
+        if re.match(r"xl/worksheets/sheet\d.xml", filename):
+            exp_xml_str = re.sub(r'horizontalDpi="200" ', "", exp_xml_str)
+            exp_xml_str = re.sub(r'verticalDpi="200" ', "", exp_xml_str)
+            exp_xml_str = re.sub(r'(<pageSetup[^>]*) r:id="rId1"', r"\1", exp_xml_str)
 
         # Remove Chart pageMargin dimensions which are almost always different.
-        if re.match(r'xl/charts/chart\d.xml', filename):
-            exp_xml_str = re.sub(r'<c:pageMargins[^>]*>',
-                                 '<c:pageMargins/>', exp_xml_str)
-            got_xml_str = re.sub(r'<c:pageMargins[^>]*>',
-                                 '<c:pageMargins/>', got_xml_str)
+        if re.match(r"xl/charts/chart\d.xml", filename):
+            exp_xml_str = re.sub(
+                r"<c:pageMargins[^>]*>", "<c:pageMargins/>", exp_xml_str
+            )
+            got_xml_str = re.sub(
+                r"<c:pageMargins[^>]*>", "<c:pageMargins/>", got_xml_str
+            )
 
         # Convert the XML string to lists for comparison.
-        if re.search('.vml$', filename):
+        if re.search(".vml$", filename):
             got_xml = _xml_to_list(got_xml_str)
             exp_xml = _vml_to_list(exp_xml_str)
         else:
@@ -212,10 +210,14 @@ def _compare_xlsx_files(got_file, exp_file, ignore_files, ignore_elements):
                 got_xml = [tag for tag in got_xml if not re.match(pattern, tag)]
 
         # Reorder the XML elements in the XLSX relationship files.
-        if filename == '[Content_Types].xml' or re.search('.rels$', filename):
+        if filename == "[Content_Types].xml" or re.search(".rels$", filename):
             got_xml = _sort_rel_file_data(got_xml)
             exp_xml = _sort_rel_file_data(exp_xml)
 
+        # Indent the XML elements to make the visual comparison of failures easier.
+        got_xml = _indent_elements(got_xml)
+        exp_xml = _indent_elements(exp_xml)
+
         # Compared the XML elements in each file.
         if got_xml != exp_xml:
             got_xml.insert(0, filename)
@@ -223,4 +225,42 @@ def _compare_xlsx_files(got_file, exp_file, ignore_files, ignore_elements):
             return got_xml, exp_xml
 
     # If we got here the files are the same.
-    return 'Ok', 'Ok'
+    return "Ok", "Ok"
+
+
+def compare_xlsx_files(file1, file2, ignore_files=None, ignore_elements=None):
+    """
+    Return True when two xlsx files compare as equal, and False otherwise.
+
+    Public wrapper around _compare_xlsx_files(). ``ignore_files`` and
+    ``ignore_elements`` are passed through, with None replaced by an empty
+    list. The check is for equality of the compared data, not equivalence.
+
+    """
+    skip_files = [] if ignore_files is None else ignore_files
+    skip_elements = [] if ignore_elements is None else ignore_elements
+
+    got_result, exp_result = _compare_xlsx_files(
+        file1, file2, skip_files, skip_elements
+    )
+    return got_result == exp_result
+
+
+def _indent_elements(xml_elements):
+    """
+    Return a copy of a list of XML elements indented by nesting depth.
+
+    Used to make the visual comparison of failures easier. Closing tags,
+    self-closing tags, one-line <a>text</a> elements and <?...?> declarations
+    don't open a new level, and the depth never drops below zero.
+
+    """
+    depth = 0
+    indented_elements = []
+
+    for element in xml_elements:
+        if element.startswith("</"):
+            depth = max(depth - 1, 0)
+        indented_elements.append("    " * depth + element)
+        if not (element.startswith("<?") or "</" in element or element.endswith("/>")):
+            depth += 1
+    return indented_elements