diff --git a/mailinglist/src/main/java/org/openjdk/skara/mailinglist/Mbox.java b/mailinglist/src/main/java/org/openjdk/skara/mailinglist/Mbox.java index 4659237b..881ef1ad 100644 --- a/mailinglist/src/main/java/org/openjdk/skara/mailinglist/Mbox.java +++ b/mailinglist/src/main/java/org/openjdk/skara/mailinglist/Mbox.java @@ -37,7 +37,7 @@ private final static Logger log = Logger.getLogger("org.openjdk.skara.mailinglist"); private final static Pattern mboxMessagePattern = Pattern.compile( - "^\\R^(From (?:.(?!^\\R^From ))*)", Pattern.MULTILINE | Pattern.DOTALL); + "^(From (?:.(?!^\\R^From ))*)", Pattern.MULTILINE | Pattern.DOTALL); private final static DateTimeFormatter ctimeFormat = DateTimeFormatter.ofPattern( "EEE LLL dd HH:mm:ss yyyy", Locale.US); private final static Pattern fromStringEncodePattern = Pattern.compile("^(>*From )", Pattern.MULTILINE); @@ -46,15 +46,30 @@ private final static Pattern decodedQuotedPrintablePattern = Pattern.compile("=\\?utf-8\\?b\\?(.*?)\\?="); private static List splitMbox(String mbox) { + // Initial split var messages = mboxMessagePattern.matcher(mbox).results() .map(match -> match.group(1)) + .filter(message -> message.length() > 0) + .map(Mbox::decodeFromStrings) + .map(Mbox::decodeQuotedPrintable) .collect(Collectors.toList()); - return messages.stream() - .filter(message -> message.length() > 0) - .map(Mbox::decodeFromStrings) - .map(Mbox::decodeQuotedPrintable) - .map(Email::parse) - .collect(Collectors.toList()); + + // Pipermail can occasionally fail to encode 'From ' in message bodies, try to handle this + var messageBuilder = new StringBuilder(); + var parsedMails = new ArrayList(); + Collections.reverse(messages); + for (var message : messages) { + messageBuilder.insert(0, message); + try { + var email = Email.parse(messageBuilder.toString()); + parsedMails.add(email); + messageBuilder.setLength(0); + } catch (RuntimeException ignored) { + } + } + + Collections.reverse(parsedMails); + return parsedMails; } private static String encodeFromStrings(String body) { diff --git a/mailinglist/src/test/java/org/openjdk/skara/mailinglist/MboxTests.java b/mailinglist/src/test/java/org/openjdk/skara/mailinglist/MboxTests.java index 6c6fb262..e6862628 100644 --- a/mailinglist/src/test/java/org/openjdk/skara/mailinglist/MboxTests.java +++ b/mailinglist/src/test/java/org/openjdk/skara/mailinglist/MboxTests.java @@ -27,9 +27,12 @@ import org.junit.jupiter.api.Test; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.time.Duration; -import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.*; class MboxTests { @Test @@ -155,4 +158,30 @@ void utf8Encode() { assertEquals(sentMail, conversation.first()); } } + + @Test + void unencodedFrom() throws IOException { + try (var folder = new TemporaryDirectory()) { + var rawMbox = folder.path().resolve("test.mbox"); + Files.writeString(rawMbox, + "From test at example.com Wed Aug 21 17:22:50 2019\n" + + "From: test at example.com (test at example.com)\n" + + "Date: Wed, 21 Aug 2019 17:22:50 +0000\n" + + "Subject: this is a test\n" + + "Message-ID: \n" + + "\n" + + "Sometimes there are unencoded from lines as well\n" + + "\n" + + "From this point onwards, it may be hard to parse this\n" + + "\n", StandardCharsets.UTF_8); + var mbox = MailingListServerFactory.createMboxFileServer(folder.path()); + var list = mbox.getList("test"); + var conversations = list.conversations(Duration.ofDays(30)); + assertEquals(1, conversations.size()); + var conversation = conversations.get(0); + assertEquals(1, conversation.allMessages().size()); + assertTrue(conversation.first().body().contains("there are unencoded"), conversation.first().body()); + assertTrue(conversation.first().body().contains("this point onwards"), conversation.first().body()); + } + } }