package jmri.util; import java.io.*; import java.nio.file.FileSystems; import java.nio.file.Path; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; import org.jsoup.*; import org.jsoup.nodes.*; import org.junit.jupiter.api.*; /** * Generates the search index for local help files. * * @author Daniel Bergqvist Copyright (C) 2022 */ public class GenerateSearchIndexTest { private static final boolean GENERATE_WHOLE_WORD_INDEX = true; private static final boolean GENERATE_BEGINNING_OF_WORD_INDEX = true; private static final boolean GENERATE_END_OF_WORD_INDEX = true; private static final boolean GENERATE_PART_OF_WORD_INDEX = false; private final Map _fileIndex = new HashMap<>(); private final Map _fileHeaderIndex = new HashMap<>(); private final Map>> _searchIndexWholeWord = new HashMap<>(); private final Map>> _searchIndexBeginning = new HashMap<>(); private final Map>> _searchIndexEnd = new HashMap<>(); private final Map>> _searchIndexPart = new HashMap<>(); private int _currentFileId = 0; // The main() method is used when this class is run directly from ant public static void main(String[] args) throws IOException { new GenerateSearchIndexTest().generateSearchIndex(); } private void addWord(String word, int fileId) { if (word.length() < 3) return; // Whole word Map> wordMapWholeWord = _searchIndexWholeWord.get(word.length()); if (wordMapWholeWord == null) { wordMapWholeWord = new HashMap<>(); _searchIndexWholeWord.put(word.length(), wordMapWholeWord); } Set fileIdListWholeWord = wordMapWholeWord.get(word); if (fileIdListWholeWord == null) { fileIdListWholeWord = new HashSet<>(); wordMapWholeWord.put(word, fileIdListWholeWord); } fileIdListWholeWord.add(fileId); for (int i=3; i <= word.length(); i++) { Map> wordMapBeginning = _searchIndexBeginning.get(i); Map> wordMapEnd = _searchIndexEnd.get(i); Map> wordMapPart = _searchIndexPart.get(i); if (wordMapBeginning == null) { wordMapBeginning = new HashMap<>(); _searchIndexBeginning.put(i, wordMapBeginning); wordMapEnd = new HashMap<>(); _searchIndexEnd.put(i, wordMapEnd); wordMapPart = new HashMap<>(); _searchIndexPart.put(i, wordMapPart); } for (int j=0; j <= word.length()-i; j++) { String part = word.substring(j, i+j); if (j==0) { // Beginning of a word Set fileIdList = wordMapBeginning.get(part); if (fileIdList == null) { fileIdList = new HashSet<>(); wordMapBeginning.put(part, fileIdList); } fileIdList.add(fileId); } if (i+j == word.length()) { // End of a word Set fileIdList = wordMapEnd.get(part); if (fileIdList == null) { fileIdList = new HashSet<>(); wordMapEnd.put(part, fileIdList); } fileIdList.add(fileId); } // Part of a word if (GENERATE_PART_OF_WORD_INDEX) { Set fileIdList = wordMapPart.get(part); if (fileIdList == null) { fileIdList = new HashSet<>(); wordMapPart.put(part, fileIdList); } fileIdList.add(fileId); } } } } private void parseHeader(Node node, String pad) { for (Node child : node.childNodes()) { if (child instanceof Element) { if ("title".equalsIgnoreCase(child.nodeName())) { String text = ((Element) child).ownText().trim().replaceAll("\r", "").replace("\n", "").replace("\"", "\\\\\""); if (!text.isBlank()) { if (_fileHeaderIndex.containsKey(_currentFileId)) { System.out.format("ERROR: Header already exists for file %s: %s --- %s%n", _fileIndex.get(_currentFileId), _fileHeaderIndex.get(_currentFileId), text); } else { _fileHeaderIndex.put(_currentFileId, text); } } } } parseHeader(child, pad+" "); } } private void parseNode(Node node, String pad) { for (Node child : node.childNodes()) { if (child instanceof TextNode) { TextNode textNode = (TextNode)child; String text = textNode.getWholeText().toLowerCase().trim(); String[] parts = text.split("\\W+"); for (String s : parts) { addWord(s, _currentFileId); } } parseNode(child, pad+" "); } } private void searchFolder(String folder) throws IOException { Path path = FileSystems.getDefault().getPath(folder); Set files = Stream.of(path.toFile().listFiles()) .filter(file -> !file.isDirectory()) .map(File::getName) .collect(Collectors.toSet()); for (String file : files) { if (file.endsWith(".shtml")) { String fileName = folder + file; _fileIndex.put(_currentFileId, fileName); Path filePath = FileSystems.getDefault().getPath(fileName); Document doc = Jsoup.parse(filePath.toFile(), "UTF-8"); parseHeader(doc.head(), ""); parseNode(doc.body(), ""); _currentFileId++; } } Set folders = Stream.of(path.toFile().listFiles()) .filter(file -> file.isDirectory()) .map(File::getName) .collect(Collectors.toSet()); for (String aFolder : folders) { searchFolder(folder + aFolder + "/"); } } private void createJsonFile() throws IOException { try (PrintWriter printWriter = new PrintWriter(new FileWriter( FileUtil.getProgramPath() + "help/en/local/search.json"))) { // Whole word if (GENERATE_WHOLE_WORD_INDEX) { printWriter.print("let searchIndexWholeWord = '{"); printWriter.print("\"files\":{"); for (Map.Entry entry : _fileIndex.entrySet()) { printWriter.format("\"%d\":[\"%s\",\"%s\"],", entry.getKey(), entry.getValue(), _fileHeaderIndex.get(entry.getKey())); } printWriter.print("\"-1\":\"\""); // Dummy data since we have a comma character after each data printWriter.print("},"); printWriter.print("\"words\":{"); for (Map.Entry>> searchIndexEntry : _searchIndexWholeWord.entrySet()) { printWriter.format("\"%d\":{", searchIndexEntry.getKey()); for (Map.Entry> wordMapEntry : searchIndexEntry.getValue().entrySet()) { printWriter.format("\"%s\":[", wordMapEntry.getKey()); for (int fileId : wordMapEntry.getValue()) { // System.out.format("%5d: %20s, %s%n", wordMap.getKey(), fileIdList.getKey(), _fileIndex.get(fileId)); printWriter.format("\"%d\",", fileId); } printWriter.print("\"-1\""); // Dummy data since we have a comma character after each data printWriter.print("],"); } printWriter.print("\"\":\"\""); // Dummy data since we have a comma character after each data printWriter.print("},"); } printWriter.print("\"-1\":\"\""); // Dummy data since we have a comma character after each data printWriter.print("}"); printWriter.println("}';"); } // Beginning of a word if (GENERATE_BEGINNING_OF_WORD_INDEX) { printWriter.print("let searchIndexBeginning = '{"); printWriter.print("\"files\":{"); for (Map.Entry entry : _fileIndex.entrySet()) { printWriter.format("\"%d\":[\"%s\",\"%s\"],", entry.getKey(), entry.getValue(), _fileHeaderIndex.get(entry.getKey())); } printWriter.print("\"-1\":\"\""); // Dummy data since we have a comma character after each data printWriter.print("},"); printWriter.print("\"words\":{"); for (Map.Entry>> searchIndexEntry : _searchIndexBeginning.entrySet()) { printWriter.format("\"%d\":{", searchIndexEntry.getKey()); for (Map.Entry> wordMapEntry : searchIndexEntry.getValue().entrySet()) { printWriter.format("\"%s\":[", wordMapEntry.getKey()); for (int fileId : wordMapEntry.getValue()) { // System.out.format("%5d: %20s, %s%n", wordMap.getKey(), fileIdList.getKey(), _fileIndex.get(fileId)); printWriter.format("\"%d\",", fileId); } printWriter.print("\"-1\""); // Dummy data since we have a comma character after each data printWriter.print("],"); } printWriter.print("\"\":\"\""); // Dummy data since we have a comma character after each data printWriter.print("},"); } printWriter.print("\"-1\":\"\""); // Dummy data since we have a comma character after each data printWriter.print("}"); printWriter.println("}';"); } // End of a word if (GENERATE_END_OF_WORD_INDEX) { printWriter.print("let searchIndexEnd = '{"); printWriter.print("\"files\":{"); for (Map.Entry entry : _fileIndex.entrySet()) { printWriter.format("\"%d\":[\"%s\",\"%s\"],", entry.getKey(), entry.getValue(), _fileHeaderIndex.get(entry.getKey())); } printWriter.print("\"-1\":\"\""); // Dummy data since we have a comma character after each data printWriter.print("},"); printWriter.print("\"words\":{"); for (Map.Entry>> searchIndexEntry : _searchIndexEnd.entrySet()) { printWriter.format("\"%d\":{", searchIndexEntry.getKey()); for (Map.Entry> wordMapEntry : searchIndexEntry.getValue().entrySet()) { printWriter.format("\"%s\":[", wordMapEntry.getKey()); for (int fileId : wordMapEntry.getValue()) { // System.out.format("%5d: %20s, %s%n", wordMap.getKey(), fileIdList.getKey(), _fileIndex.get(fileId)); printWriter.format("\"%d\",", fileId); } printWriter.print("\"-1\""); // Dummy data since we have a comma character after each data printWriter.print("],"); } printWriter.print("\"\":\"\""); // Dummy data since we have a comma character after each data printWriter.print("},"); } printWriter.print("\"-1\":\"\""); // Dummy data since we have a comma character after each data printWriter.print("}"); printWriter.println("}';"); } // Part of a word if (GENERATE_PART_OF_WORD_INDEX) { printWriter.print("let searchIndexPart = '{"); printWriter.print("\"files\":{"); for (Map.Entry entry : _fileIndex.entrySet()) { printWriter.format("\"%d\":[\"%s\",\"%s\"],", entry.getKey(), entry.getValue(), _fileHeaderIndex.get(entry.getKey())); } printWriter.print("\"-1\":\"\""); // Dummy data since we have a comma character after each data printWriter.print("},"); printWriter.print("\"words\":{"); for (Map.Entry>> searchIndexEntry : _searchIndexPart.entrySet()) { printWriter.format("\"%d\":{", searchIndexEntry.getKey()); for (Map.Entry> wordMapEntry : searchIndexEntry.getValue().entrySet()) { printWriter.format("\"%s\":[", wordMapEntry.getKey()); for (int fileId : wordMapEntry.getValue()) { // System.out.format("%5d: %20s, %s%n", wordMap.getKey(), fileIdList.getKey(), _fileIndex.get(fileId)); printWriter.format("\"%d\",", fileId); } printWriter.print("\"-1\""); // Dummy data since we have a comma character after each data printWriter.print("],"); } printWriter.print("\"\":\"\""); // Dummy data since we have a comma character after each data printWriter.print("},"); } printWriter.print("\"-1\":\"\""); // Dummy data since we have a comma character after each data printWriter.print("}"); printWriter.println("}';"); } } } private void generateSearchIndex() throws IOException { searchFolder("help/en/html/"); searchFolder("help/en/package/"); searchFolder("help/en/manual/"); createJsonFile(); } @Test public void testGenerateSearchIndex() throws IOException { Assumptions.assumeFalse( Boolean.getBoolean("jmri.skipBuildHelpFilesTest"), "Ignoring GenerateSearchIndexTest"); generateSearchIndex(); } @BeforeEach public void setUp() { JUnitUtil.setUp(); } @AfterEach public void tearDown() { JUnitUtil.tearDown(); } // private static final Logger log = LoggerFactory.getLogger(GenerateSearchIndexTest.class); }