package jmri.util; import java.io.*; import java.nio.file.*; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; import org.junit.jupiter.api.*; /** * Check help files for UTF-8 characters. * Files that contain &l;tmeta charset="utf-8"> are exempt. * * @author Daniel Bergqvist Copyright (C) 2022 */ public class CheckHelpFilesForUTF8Test { private final Map convertChar = new HashMap<>(); private final Set foundChar = new HashSet<>(); private int numErrors = 0; private void searchFolder(String folder) throws IOException { Path path = FileSystems.getDefault().getPath(folder); Set files = Stream.of(path.toFile().listFiles()) .filter(file -> !file.isDirectory()) .map(File::getName) .collect(Collectors.toSet()); for (String file : files) { if (file.endsWith(".shtml")) { String fileName = folder + file; var lines = Files.readAllLines(Paths.get(fileName), StandardCharsets.UTF_8); for (String s : lines) { if (s.contains("")) break; // no further testing for UTF s.codePoints().forEach((codePoint) -> { if (codePoint > 127) { numErrors++; foundChar.add(codePoint); String expected = convertChar.get(codePoint); log.error( "Invalid character. Codepoint: {}, Character: {}, Replace with: {}, File: {}", codePoint, new String(Character.toChars(codePoint)), expected, fileName); } }); } } } Set folders = Stream.of(path.toFile().listFiles()) .filter(file -> file.isDirectory()) .map(File::getName) .collect(Collectors.toSet()); for (String aFolder : folders) { searchFolder(folder + aFolder + "/"); } } @Test public void testGenerateSearchIndex() throws IOException { // See: https://www.w3schools.com/charsets/ref_utf_punctuation.asp convertChar.put(169, "©"); convertChar.put(174, "®"); convertChar.put(176, "°"); convertChar.put(200, "È"); convertChar.put(201, "É"); convertChar.put(220, "Ü"); convertChar.put(223, "ß"); convertChar.put(224, "à"); convertChar.put(225, "á"); convertChar.put(226, "â"); convertChar.put(228, "ä"); convertChar.put(229, "å"); convertChar.put(230, "æ"); convertChar.put(231, "ç"); convertChar.put(232, "è"); convertChar.put(233, "é"); convertChar.put(234, "ê"); convertChar.put(237, "í"); convertChar.put(241, "ñ"); convertChar.put(244, "ô"); convertChar.put(246, "ö"); convertChar.put(248, "ø"); convertChar.put(252, "ü"); convertChar.put(253, "ý"); convertChar.put(268, "Č"); convertChar.put(283, "ě"); convertChar.put(339, "œ"); convertChar.put(345, "ř"); convertChar.put(352, "Š"); convertChar.put(381, "Ž"); convertChar.put(8209, "‑"); convertChar.put(8211, "–"); convertChar.put(8212, "—"); convertChar.put(8216, "‘"); convertChar.put(8217, "’"); convertChar.put(8220, "“"); convertChar.put(8221, "”"); convertChar.put(8226, "•"); convertChar.put(8230, "…"); convertChar.put(8250, "›"); convertChar.put(8482, "™"); convertChar.put(8594, "→"); convertChar.put(8629, "↵"); convertChar.put(8658, "⇒"); convertChar.put(9662, "▾"); convertChar.put(10004, "✔"); searchFolder("help/en/"); for (int codePoint : foundChar) { String expected = convertChar.get(codePoint); log.error("Found UTF-8 Codepoint: {}, Character: {}. Expected: {}", codePoint, new String(Character.toChars(codePoint)), expected); } if (numErrors > 0) log.error("Num errors: {}", numErrors); } @BeforeEach public void setUp() { JUnitUtil.setUp(); } @AfterEach public void tearDown() { JUnitUtil.tearDown(); } private static final org.slf4j.Logger log = org.slf4j.LoggerFactory.getLogger(CheckHelpFilesForUTF8Test.class); }