Compare commits

...

5 Commits

21 changed files with 318 additions and 52 deletions

View File

@ -16,6 +16,9 @@ val kotlinCoroutinesVersion: String by project
val mustangVersion: String by project
val textInfoExtractor: String by project
val pdfboxTextExtractor: String by project
val angusMailVersion: String by project
val klfVersion: String by project
@ -30,6 +33,10 @@ dependencies {
implementation("org.mustangproject:library:$mustangVersion")
implementation("org.mustangproject:validator:$mustangVersion")
// pdf invoice data extraction
api("net.dankito.text.extraction:text-info-extractor:$textInfoExtractor")
api("net.dankito.text.extraction:pdfbox-text-extractor:$pdfboxTextExtractor")
implementation("org.eclipse.angus:angus-mail:$angusMailVersion")
implementation("net.codinux.log:klf:$klfVersion")

View File

@ -9,11 +9,17 @@ import kotlinx.coroutines.*
import net.codinux.invoicing.email.model.*
import net.codinux.invoicing.filesystem.FileUtil
import net.codinux.invoicing.model.Invoice
import net.codinux.invoicing.pdf.PdfInvoiceData
import net.codinux.invoicing.pdf.PdfInvoiceDataExtractor
import net.codinux.invoicing.reader.EInvoiceReader
import net.codinux.invoicing.util.ExceptionHelper
import net.codinux.log.logger
import org.eclipse.angus.mail.imap.IMAPFolder
import org.eclipse.angus.mail.imap.IMAPMessage
import org.eclipse.angus.mail.util.MailConnectException
import java.io.File
import java.net.ConnectException
import java.net.UnknownHostException
import java.time.Instant
import java.util.*
import java.util.concurrent.Executors
@ -21,7 +27,9 @@ import kotlin.math.max
open class EmailsFetcher(
protected open val eInvoiceReader: EInvoiceReader = EInvoiceReader(),
protected open val coroutineDispatcher: CoroutineDispatcher = Executors.newFixedThreadPool(max(24, Runtime.getRuntime().availableProcessors() * 4)).asCoroutineDispatcher()
protected open val pdfInvoiceDataExtractor: PdfInvoiceDataExtractor = PdfInvoiceDataExtractor(),
protected open val coroutineDispatcher: CoroutineDispatcher = Executors.newFixedThreadPool(max(24, Runtime.getRuntime().availableProcessors() * 4)).asCoroutineDispatcher(),
protected open val exceptionHelper: ExceptionHelper = ExceptionHelper()
) {
protected data class MessagePart(
@ -40,34 +48,64 @@ open class EmailsFetcher(
protected val log by logger()
/**
 * Checks whether a connection to the given [account] can be established.
 *
 * Connects with debug output on the console enabled and closes the connection right away.
 *
 * @return [CheckCredentialsResult.Ok] if connecting succeeded, otherwise the result
 * [mapConnectResultError] derives from the thrown exception.
 */
open fun checkCredentials(account: EmailAccount): CheckCredentialsResult =
    try {
        // only the connection attempt matters here, so the connection is closed immediately
        close(connect(account, FetchEmailsOptions(showDebugOutputOnConsole = true)))

        CheckCredentialsResult.Ok
    } catch (e: Throwable) {
        log.info(e) { "Could not connect to account '$account'" }

        mapConnectResultError(e)
    }
/**
 * Maps an exception thrown while connecting to an email account to a [CheckCredentialsResult].
 *
 * - [AuthenticationFailedException] -> [CheckCredentialsResult.WrongUsername]
 * - [MailConnectException] caused by an [UnknownHostException] -> [CheckCredentialsResult.InvalidImapServerAddress]
 * - [MailConnectException] caused by a [ConnectException] -> [CheckCredentialsResult.InvalidImapServerPort]
 * - any other [MessagingException] -> [CheckCredentialsResult.WrongPassword]
 * - everything else -> [CheckCredentialsResult.UnknownError]
 *
 * NOTE(review): the AuthenticationFailedException -> WrongUsername and MessagingException -> WrongPassword
 * mapping is kept as it was; confirm that the mail server really allows telling these two cases apart.
 *
 * @param exception the exception thrown by the connection attempt
 */
protected open fun mapConnectResultError(exception: Throwable): CheckCredentialsResult = when (exception) {
    is AuthenticationFailedException -> CheckCredentialsResult.WrongUsername

    is MailConnectException -> {
        // The network error may be wrapped more than once (presumably MailConnectException -> SocketConnectException
        // -> UnknownHostException / ConnectException), so search the cause chain instead of looking only at the direct
        // cause. The chain length is bounded to guard against cyclic cause chains.
        val networkError = generateSequence(exception.cause) { it.cause }
            .take(10)
            .firstOrNull { it is UnknownHostException || it is ConnectException }

        when (networkError) {
            is UnknownHostException -> CheckCredentialsResult.InvalidImapServerAddress
            is ConnectException -> CheckCredentialsResult.InvalidImapServerPort
            else -> CheckCredentialsResult.UnknownError
        }
    }

    // AuthenticationFailedException and MailConnectException are both derived from MessagingException,
    // so this branch has to come after them
    is MessagingException -> CheckCredentialsResult.WrongPassword

    else -> CheckCredentialsResult.UnknownError // fallback for cases i am not aware of
}
open fun listenForNewEmails(account: EmailAccount, options: ListenForNewMailsOptions) = runBlocking {
try {
connect(account, options) { store ->
val folder = store.getFolder(options.emailFolderName) as IMAPFolder
folder.open(Folder.READ_ONLY)
val status = connect(account, options)
val status = FetchEmailsStatus(account, folder, options)
folder.addMessageCountListener(object : MessageCountAdapter() {
override fun messagesAdded(event: MessageCountEvent) {
event.messages.forEach { message ->
getEmail(message, status)
}
status.folder.addMessageCountListener(object : MessageCountAdapter() {
override fun messagesAdded(event: MessageCountEvent) {
event.messages.forEach { message ->
getEmail(message, status)
}
})
launch(coroutineDispatcher) {
keepConnectionOpen(status, folder, options)
}
})
launch(coroutineDispatcher) {
keepConnectionOpen(status, options)
}
close(status)
} catch (e: Throwable) {
log.error(e) { "Listening to new emails of '${account.username}' failed" }
options.onError?.invoke(FetchEmailError(FetchEmailErrorType.ListenForNewEmails, null, e))
}
}
protected open suspend fun keepConnectionOpen(status: FetchEmailsStatus, folder: IMAPFolder, options: ListenForNewMailsOptions) {
protected open suspend fun keepConnectionOpen(status: FetchEmailsStatus, options: ListenForNewMailsOptions) {
val account = status.account
val folder = status.folder
log.info { "Listening to new emails of $account" }
// Use IMAP IDLE to keep the connection alive
@ -88,18 +126,13 @@ open class EmailsFetcher(
open fun fetchAllEmails(account: EmailAccount, options: FetchEmailsOptions = FetchEmailsOptions()): FetchEmailsResult {
try {
return connect(account, options) { store ->
val folder = store.getFolder(options.emailFolderName) as IMAPFolder
folder.open(Folder.READ_ONLY)
val status = connect(account, options)
val status = FetchEmailsStatus(account, folder, options)
val emails = fetchAllEmailsInFolder(status)
val emails = fetchAllEmailsInFolder(status).also {
folder.close(false)
}
close(status)
FetchEmailsResult(emails, null, status.messageSpecificErrors)
}
return FetchEmailsResult(emails, null, status.messageSpecificErrors)
} catch (e: Throwable) {
log.error(e) { "Could not fetch emails of account $account" }
@ -208,12 +241,14 @@ open class EmailsFetcher(
val (invoice, invoiceFile) = tryToReadEInvoice(part, extension, messagePart.mediaType, status)
val pdfInvoiceData: PdfInvoiceData? = tryToReadInvoiceDataFromPdf(extension, messagePart.mediaType, invoiceFile)
if (invoice != null || Part.ATTACHMENT.equals(part.disposition, ignoreCase = true)) {
val file = invoiceFile ?:
if (extension !in status.options.downloadAttachmentsWithExtensions) null
else downloadAttachment(part, status)
return EmailAttachment(part.fileName, extension, part.size.takeIf { it > 0 }, mapDisposition(part), messagePart.mediaType, part.contentType, invoice, file)
return EmailAttachment(part.fileName, extension, part.size.takeIf { it > 0 }, mapDisposition(part), messagePart.mediaType, part.contentType, invoice, pdfInvoiceData, file)
}
} catch (e: Throwable) {
log.error(e) { "Could not check attachment '${messagePart.part.fileName}' (${messagePart.mediaType}) for eInvoice" }
@ -250,6 +285,14 @@ open class EmailsFetcher(
}
}
/**
 * Tries to extract unstructured invoice data from an attachment that looks like a PDF.
 *
 * @return the extracted data, or `null` if [invoiceFile] is `null`, the attachment does not look like a PDF
 * or no invoice data could be extracted.
 */
private fun tryToReadInvoiceDataFromPdf(extension: String, mediaType: String, invoiceFile: File?): PdfInvoiceData? {
    val looksLikePdf = extension == "pdf" || mediaType == "application/pdf" || mediaType == "application/octet-stream"

    // for a PDF the file has presumably already been downloaded while checking the attachment for an eInvoice,
    // so invoiceFile is expected to be non-null in that case
    if (invoiceFile == null || !looksLikePdf) {
        return null
    }

    return pdfInvoiceDataExtractor.tryToExtractInvoiceData(invoiceFile).data // TODO: pass result.error to status.onError()
}
protected open fun getAllMessageParts(part: Part): List<MessagePart> {
return if (part.isMimeType("multipart/*")) {
@ -326,18 +369,22 @@ open class EmailsFetcher(
date.toInstant()
protected open fun <T> connect(account: EmailAccount, options: FetchEmailsOptions, connected: (Store) -> T): T {
protected open fun connect(account: EmailAccount, options: FetchEmailsOptions): FetchEmailsStatus {
val properties = mapAccountToJavaMailProperties(account, options)
val session = Session.getInstance(properties)
session.getStore("imap").use { store ->
store.connect(account.serverAddress, account.username, account.password)
session.debug = options.showDebugOutputOnConsole
return connected(store)
}
val store = session.getStore("imap")
store.connect(account.serverAddress, account.username, account.password)
val folder = store.getFolder(options.emailFolderName) as IMAPFolder
folder.open(Folder.READ_ONLY)
return FetchEmailsStatus(account, store, folder, options)
}
protected open fun mapAccountToJavaMailProperties(account: EmailAccount, options: FetchEmailsOptions) = Properties().apply {
protected open fun mapAccountToJavaMailProperties(account: EmailAccount, options: FetchEmailsOptions = FetchEmailsOptions()) = Properties().apply {
// the documentation of all properties can be found here: https://javaee.github.io/javamail/docs/api/com/sun/mail/imap/package-summary.html
put("mail.store.protocol", "imap")
@ -354,4 +401,14 @@ open class EmailsFetcher(
put("mail.imap.partialfetch", "false") // Controls whether the IMAP partial-fetch capability should be used. Defaults to true.
}
/**
 * Closes the folder and the store of the given [status].
 *
 * Both are closed independently of each other, so that a failure to close the folder
 * (e.g. because it is not open anymore) does not leave the store connection open.
 * Errors are logged and not rethrown.
 */
protected open fun close(status: FetchEmailsStatus) {
    try {
        status.folder.close(false) // false = do not expunge deleted messages
    } catch (e: Exception) {
        log.error(e) { "Could not close folder or store" }
    }

    try {
        status.store.close()
    } catch (e: Exception) {
        log.error(e) { "Could not close folder or store" }
    }
}
}

View File

@ -1,13 +0,0 @@
package net.codinux.invoicing.email
/**
 * The kind of operation that failed while fetching emails; used as the type of a `FetchEmailError`.
 */
enum class FetchEmailErrorType {
GetEmail,
// NOTE(review): identifier is misspelled (three 's'); renaming it would break existing callers
GetMesssageBody,
GetAttachment,
ExtractInvoice,
// reported by EmailsFetcher.listenForNewEmails() when listening for new emails fails
ListenForNewEmails
}

View File

@ -1,6 +1,7 @@
package net.codinux.invoicing.email
import net.codinux.invoicing.email.model.Email
import net.codinux.invoicing.email.model.FetchEmailError
import java.io.File
import java.time.Instant
import java.time.LocalDate
@ -29,6 +30,8 @@ open class FetchEmailsOptions(
val emailFolderName: String = "INBOX",
val connectTimeoutSeconds: Int = 5,
val showDebugOutputOnConsole: Boolean = false,
val onError: ((FetchEmailError) -> Unit)? = null,
val onEmailReceived: ((Email) -> Unit)? = null
) {

View File

@ -3,13 +3,17 @@ package net.codinux.invoicing.email
import jakarta.mail.BodyPart
import jakarta.mail.Message
import jakarta.mail.Part
import jakarta.mail.Store
import net.codinux.invoicing.email.model.EmailAccount
import net.codinux.invoicing.email.model.FetchEmailError
import net.codinux.invoicing.email.model.FetchEmailErrorType
import net.codinux.invoicing.filesystem.FileUtil
import org.eclipse.angus.mail.imap.IMAPFolder
import java.io.File
data class FetchEmailsStatus(
val account: EmailAccount,
val store: Store,
val folder: IMAPFolder,
val options: FetchEmailsOptions,
val messageSpecificErrors: MutableList<FetchEmailError> = mutableListOf()

View File

@ -1,6 +1,7 @@
package net.codinux.invoicing.email
import net.codinux.invoicing.email.model.Email
import net.codinux.invoicing.email.model.FetchEmailError
import java.io.File
import java.util.concurrent.atomic.AtomicBoolean
@ -16,11 +17,13 @@ open class ListenForNewMailsOptions(
emailFolderName: String = "INBOX",
connectTimeoutSeconds: Int = 5,
showDebugOutputOnConsole: Boolean = false,
onError: ((FetchEmailError) -> Unit)? = null,
onEmailReceived: (Email) -> Unit
) : FetchEmailsOptions(
null,
downloadMessageBody, downloadOnlyPlainTextOrHtmlMessageBody, null,
downloadAttachmentsWithExtensions, attachmentsDownloadDirectory,
emailFolderName, connectTimeoutSeconds, onError, onEmailReceived
emailFolderName, connectTimeoutSeconds, showDebugOutputOnConsole, onError, onEmailReceived
)

View File

@ -0,0 +1,18 @@
package net.codinux.invoicing.email.model
/**
 * Outcome of checking the credentials of an email account, see `EmailsFetcher.checkCredentials()`.
 */
enum class CheckCredentialsResult {
// connecting to the account succeeded
Ok,
// connecting failed with an AuthenticationFailedException
WrongUsername,
// connecting failed with a MessagingException that is neither an AuthenticationFailedException nor a MailConnectException
WrongPassword,
// connecting failed with a MailConnectException caused by an UnknownHostException
InvalidImapServerAddress,
// connecting failed with a MailConnectException caused by a ConnectException
InvalidImapServerPort,
// fallback for all other errors
UnknownError
}

View File

@ -36,9 +36,11 @@ class Email(
val hasAttachments: Boolean by lazy { attachments.isNotEmpty() }
val hasPdfAttachment: Boolean by lazy { attachments.any { it.isPdfFile } }
val hasEInvoiceAttachment: Boolean by lazy { attachments.any { it.containsEInvoice } }
val hasPdfAttachment: Boolean by lazy { attachments.any { it.isPdfFile } }
val hasAttachmentsWithExtractedInvoiceData: Boolean by lazy { attachments.any { it.couldExtractPdfInvoiceData } }
override fun toString() = "${date.atZone(ZoneId.systemDefault()).toLocalDate()} $sender: $subject, ${attachments.size} attachment(s)"

View File

@ -1,6 +1,7 @@
package net.codinux.invoicing.email.model
import net.codinux.invoicing.model.Invoice
import net.codinux.invoicing.pdf.PdfInvoiceData
import java.io.File
class EmailAttachment(
@ -16,11 +17,14 @@ class EmailAttachment(
val mediaType: String?,
val contentType: String?,
val invoice: Invoice? = null,
val pdfInvoiceData: PdfInvoiceData? = null,
val file: File? = null
) {
val isPdfFile: Boolean by lazy { extension == "pdf" || mediaType == "application/pdf" }
val containsEInvoice: Boolean by lazy { invoice != null }
val isPdfFile: Boolean by lazy { extension == "pdf" || mediaType == "application/pdf" }
val couldExtractPdfInvoiceData: Boolean by lazy { pdfInvoiceData != null }
override fun toString() = "$filename: $invoice"
}

View File

@ -1,4 +1,4 @@
package net.codinux.invoicing.email
package net.codinux.invoicing.email.model
data class FetchEmailError(
val type: FetchEmailErrorType,

View File

@ -0,0 +1,13 @@
package net.codinux.invoicing.email.model
/**
 * The kind of operation that failed while fetching emails; used as the type of a [FetchEmailError].
 */
enum class FetchEmailErrorType {
GetEmail,
// NOTE(review): identifier is misspelled (three 's'); renaming it would break existing callers
GetMesssageBody,
GetAttachment,
ExtractInvoice, // TODO: due to orNull() these errors aren't caught anymore
// reported by EmailsFetcher.listenForNewEmails() when listening for new emails fails
ListenForNewEmails
}

View File

@ -1,6 +1,4 @@
package net.codinux.invoicing.email
import net.codinux.invoicing.email.model.Email
package net.codinux.invoicing.email.model
data class FetchEmailsResult(
val emails: List<Email>,

View File

@ -0,0 +1,11 @@
package net.codinux.invoicing.pdf
import java.math.BigDecimal
/**
 * A monetary amount together with its currency.
 *
 * @property amount the numeric value
 * @property currency the currency of [amount]
 * @property amountWithCurrency display string of amount and currency; defaults to `"<amount> <currency>"`
 * with the amount in plain (non-scientific) notation
 */
class AmountOfMoney(
    val amount: BigDecimal,
    val currency: String,
    // toPlainString() instead of toString(): BigDecimal.toString() switches to scientific notation
    // for some values (e.g. "1E+2" for 100 after stripTrailingZeros()), which is unsuitable for display
    val amountWithCurrency: String = "${amount.toPlainString()} $currency"
) {
    override fun toString() = amountWithCurrency
}

View File

@ -0,0 +1,17 @@
package net.codinux.invoicing.pdf
import net.dankito.text.extraction.ITextExtractor
import net.dankito.text.extraction.pdf.PdfBoxPdfTextExtractor
import java.io.File
/**
 * [PdfTextExtractor] implementation that delegates the text extraction to an [ITextExtractor].
 */
open class PdfBoxPdfTextExtractor(
    // fully qualified, as this class has the same simple name as the library class it wraps
    protected open val textExtractor: ITextExtractor = net.dankito.text.extraction.pdf.PdfBoxPdfTextExtractor()
) : PdfTextExtractor {

    /**
     * Extracts the text of [pdfFile] and maps the extractor's result to a [PdfTextExtractorResult].
     */
    override fun extractTextFromPdf(pdfFile: File): PdfTextExtractorResult =
        textExtractor.extractText(pdfFile).let { extraction ->
            PdfTextExtractorResult(extraction.text, extraction.error?.exception)
        }

}

View File

@ -0,0 +1,33 @@
package net.codinux.invoicing.pdf
import java.math.BigDecimal
import java.time.LocalDate
/**
 * PDFs contain only unstructured data, so it's way harder to get invoice data from PDFs than from structured XML eInvoice files.
 *
 * So we can only guess which amount is the total amount, which are the net and VAT amounts, which date is the invoice date, ...
 *
 * Therefore the names of the guessed properties of this class start with 'potential' to reflect this circumstance.
 * The 'found...' properties list the candidates found in the PDF and [pdfText] holds the text they were extracted from.
 */
class PdfInvoiceData(
    // best guesses
    val potentialTotalAmount: AmountOfMoney,
    val potentialNetAmount: AmountOfMoney? = null,
    val potentialValueAddedTax: AmountOfMoney? = null,
    val potentialValueAddedTaxRate: BigDecimal? = null,
    val potentialIban: String? = null,
    val potentialBic: String? = null,

    // all candidates that were found
    val foundAmounts: List<AmountOfMoney> = emptyList(),
    val foundPercentages: List<AmountOfMoney> = emptyList(),
    val foundDates: List<LocalDate> = emptyList(),
    val foundPotentialIbans: List<String> = emptyList(),
    val foundPotentialBics: List<String> = emptyList(),

    val pdfText: String
) {
    override fun toString(): String = potentialTotalAmount.toString()
}

View File

@ -0,0 +1,10 @@
package net.codinux.invoicing.pdf
/**
 * Result of trying to extract invoice data from a PDF.
 *
 * @property error the error that occurred, if any
 * @property data the extracted invoice data, or `null` if the extraction did not succeed
 */
class PdfInvoiceDataExtractionResult(
    val error: Throwable?,
    val data: PdfInvoiceData?
) {
    override fun toString(): String = when (data) {
        null -> "Error: $error"
        else -> "Success: $data"
    }
}

View File

@ -0,0 +1,58 @@
package net.codinux.invoicing.pdf
import net.dankito.text.extraction.info.invoice.InvoiceDataExtractor
import net.dankito.text.extraction.info.model.InvoiceData
import java.io.File
import java.time.LocalDate
/**
 * PDFs contain only unstructured data, so it's way harder to get invoice data from PDFs than from structured XML eInvoice files.
 *
 * But for validation purposes or PDFs without attached eInvoice XML we also try to extract unstructured invoice data from PDFs.
 */
open class PdfInvoiceDataExtractor(
    protected open val textExtractor: PdfTextExtractor = PdfBoxPdfTextExtractor(),
    protected open val invoiceDataExtractor: InvoiceDataExtractor = InvoiceDataExtractor()
) {

    /**
     * Extracts the text of the PDF [file] and tries to find invoice data in it.
     *
     * @return a result containing either the extracted [PdfInvoiceData] or the error describing why no data
     * could be extracted (text extraction failed, invoice data extraction failed or no total amount was found).
     */
    open fun tryToExtractInvoiceData(file: File): PdfInvoiceDataExtractionResult {
        val textExtractionResult = extractTextFromPdf(file)
        val pdfText = textExtractionResult.text
        if (textExtractionResult.error != null || pdfText == null) {
            // never return a result that has neither data nor error
            val error = textExtractionResult.error ?: IllegalStateException("Could not extract text from PDF $file")
            return PdfInvoiceDataExtractionResult(error, null)
        }

        val result = invoiceDataExtractor.extractInvoiceData(pdfText)

        return when {
            result.error != null -> PdfInvoiceDataExtractionResult(result.error, null)
            result.potentialTotalAmount == null ->
                PdfInvoiceDataExtractionResult(IllegalStateException("Could not find total amount of invoice in PDF $file"), null)
            else -> PdfInvoiceDataExtractionResult(null, mapInvoiceData(result, pdfText))
        }
    }

    /** Extracts the text of [file] with the configured [textExtractor]. */
    protected open fun extractTextFromPdf(file: File): PdfTextExtractorResult =
        textExtractor.extractTextFromPdf(file)

    /**
     * Maps the library's [InvoiceData] to [PdfInvoiceData].
     *
     * @throws IllegalArgumentException if [result] has no potential total amount
     */
    protected open fun mapInvoiceData(result: InvoiceData, pdfText: String): PdfInvoiceData {
        val totalAmount = requireNotNull(mapAmount(result.potentialTotalAmount)) {
            "InvoiceData without a potential total amount cannot be mapped to PdfInvoiceData"
        }

        return PdfInvoiceData(
            potentialTotalAmount = totalAmount,
            potentialNetAmount = mapAmount(result.potentialNetAmount),
            potentialValueAddedTax = mapAmount(result.potentialValueAddedTax),
            potentialValueAddedTaxRate = result.potentialValueAddedTaxRate?.amount,
            potentialIban = result.potentialIban,
            potentialBic = result.potentialBic,
            foundAmounts = result.allAmounts.mapNotNull { mapAmount(it) },
            foundPercentages = result.percentages.mapNotNull { mapAmount(it) },
            foundDates = result.dates.map { LocalDate.of(it.year, it.month, it.day) },
            foundPotentialIbans = result.ibans.map { it.hit },
            foundPotentialBics = result.bics.map { it.hit },
            pdfText = pdfText
        )
    }

    /** Maps the library's amount type to this module's [AmountOfMoney]; `null` stays `null`. */
    protected open fun mapAmount(amount: net.dankito.text.extraction.info.model.AmountOfMoney?) =
        amount?.let { AmountOfMoney(it.amount, it.currency, it.amountWithCurrency) }

}

View File

@ -0,0 +1,9 @@
package net.codinux.invoicing.pdf
import java.io.File
/**
 * Extracts the text of PDF files.
 */
interface PdfTextExtractor {
/**
 * Extracts the text of [pdfFile].
 *
 * @return a [PdfTextExtractorResult] holding the extracted text and / or the error that occurred
 */
fun extractTextFromPdf(pdfFile: File): PdfTextExtractorResult
}

View File

@ -0,0 +1,10 @@
package net.codinux.invoicing.pdf
/**
 * Result of extracting the text of a PDF.
 *
 * @property text the extracted text, or `null` if none could be extracted
 * @property error the error that occurred, if any
 */
data class PdfTextExtractorResult(
    val text: String?,
    val error: Throwable?
) {
    override fun toString(): String =
        text?.let { extractedText -> "Success: $extractedText" }
            ?: "Error: $error"
}

View File

@ -0,0 +1,17 @@
package net.codinux.invoicing.util
/**
 * Helper for inspecting the cause chain of exceptions.
 */
open class ExceptionHelper {

    /**
     * Follows the cause chain of [exception] for at most [maxDepth] steps.
     *
     * Stops early as soon as an exception has no cause or its cause is not an [Exception] (e.g. an [Error]).
     *
     * @return the last exception reached; [exception] itself if it has no [Exception] cause or [maxDepth] is not positive
     */
    open fun getInnerException(exception: Exception, maxDepth: Int = 3): Exception {
        var current = exception

        repeat(maxDepth) {
            current = current.cause as? Exception ?: return current
        }

        return current
    }

}

View File

@ -14,8 +14,13 @@ kotlinCoroutinesVersion=1.9.0
quarkusVersion=3.16.3
# Mustang 2.14 pulls PDFBox 3.x on the classpath, which is incompatible with PDFBox 2.x used by pdfbox-text-extractor
# but Mustang versions 2.13 and 2.12 are missing their dependencies in pom.xml
mustangVersion=2.14.2
textInfoExtractor=1.0.3
pdfboxTextExtractor=0.6.1
angusMailVersion=2.0.3
klfVersion=1.6.2