diff --git a/app/TesseractOCRModel.defaults.yml b/app/TesseractOCRModel.defaults.yml new file mode 100644 index 0000000000000000000000000000000000000000..37929e89d70c609a834f72aaae1f0ec0190419ca --- /dev/null +++ b/app/TesseractOCRModel.defaults.yml @@ -0,0 +1,816 @@ +# - title: 'Afrikaans' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/afr.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Amharic' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/amh.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Arabic' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ara.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Assamese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/asm.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Azerbaijani' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Azerbaijani - Cyrillic' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze_cyrl.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Belarusian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bel.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Bengali' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ben.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Tibetan' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bod.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Bosnian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bos.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Bulgarian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bul.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Catalan; Valencian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cat.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Cebuano' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ceb.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Czech' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ces.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Chinese - Simplified' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_sim.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Chinese - Traditional' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_tra.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Cherokee' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chr.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Welsh' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cym.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Danish' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dan.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'German' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/deu.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Dzongkha' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dzo.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Greek, Modern (1453-)' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ell.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'English' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eng.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'English, Middle (1100-1500)' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/enm.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Esperanto' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/epo.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Estonian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/est.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Basque' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eus.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Persian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fas.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Finnish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fin.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'French' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fra.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'German Fraktur' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frk.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'French, Middle (ca. 1400-1600)' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frm.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Irish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/gle.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Galician' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/glg.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Greek, Ancient (-1453)' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/grc.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Gujarati' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/guj.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Haitian; Haitian Creole' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hat.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Hebrew' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/heb.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Hindi' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hin.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Croatian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hrv.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Hungarian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hun.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Inuktitut' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/iku.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Indonesian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ind.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Icelandic' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/isl.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Italian' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'Italian - Old' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita_old.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Javanese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jav.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Japanese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jpn.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Kannada' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kan.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Georgian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Georgian - Old' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat_old.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Kazakh' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kaz.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Central Khmer' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/khm.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Kirghiz; Kyrgyz' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kir.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Korean' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kor.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Kurdish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kur.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Lao' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lao.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Latin' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lat.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Latvian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lav.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Lithuanian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lit.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Malayalam' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mal.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Marathi' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mar.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Macedonian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mkd.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Maltese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mlt.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Malay' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/msa.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Burmese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mya.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Nepali' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nep.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Dutch; Flemish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nld.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Norwegian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nor.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Oriya' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ori.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Panjabi; Punjabi' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pan.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Polish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pol.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Portuguese' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/por.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Pushto; Pashto' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pus.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Romanian; Moldavian; Moldovan' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ron.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Russian' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/rus.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Sanskrit' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/san.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Sinhala; Sinhalese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sin.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Slovak' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slk.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Slovenian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slv.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +- title: 'Spanish; Castilian' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +- title: 'Spanish; Castilian - Old' + description: '' + url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa_old.traineddata' + publisher: 'tesseract-ocr' + publishing_year: 2021 + version: '4.1.0' + compatible_service_versions: + - '0.1.0' +# - title: 'Albanian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sqi.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Serbian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Serbian - Latin' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp_latn.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Swahili' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swa.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Swedish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swe.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Syriac' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/syr.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Tamil' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tam.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Telugu' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tel.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Tajik' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgk.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Tagalog' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgl.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Thai' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tha.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Tigrinya' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tir.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Turkish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tur.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Uighur; Uyghur' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uig.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Ukrainian' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ukr.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Urdu' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/urd.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Uzbek' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Uzbek - Cyrillic' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb_cyrl.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Vietnamese' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/vie.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' +# - title: 'Yiddish' +# description: '' +# url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/yid.traineddata' +# publisher: 'tesseract-ocr' +# publishing_year: 2021 +# version: '4.1.0' +# compatible_service_versions: +# - '0.1.0' diff --git a/app/__init__.py b/app/__init__.py index 37b0961f4fabffef52494596c879629d83c499fd..5c4052d2970624af75c0aea8f0a816930cf4c19f 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -39,9 +39,6 @@ def create_app(config: Config = Config) -> Flask: socketio.init_app( app, message_queue=app.config['NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI']) - # from .utils import HashidConverter - # app.url_map.converters['hashid'] = HashidConverter - from .events import socketio as socketio_events from .events import sqlalchemy as sqlalchemy_events @@ -54,6 +51,9 @@ def create_app(config: Config = Config) -> Flask: from .auth import bp as auth_blueprint app.register_blueprint(auth_blueprint, url_prefix='/auth') + from .contribute import bp as contribute_blueprint + app.register_blueprint(contribute_blueprint, url_prefix='/contribute') + from .corpora import bp as corpora_blueprint app.register_blueprint(corpora_blueprint, url_prefix='/corpora') diff --git a/app/api/__init__.py b/app/api/__init__.py index f47235ea9b9801dd6e4f0048d5f3ca368a77b0a1..e7674c87d8a298b4d3917e54690051add2257911 100644 --- a/app/api/__init__.py +++ b/app/api/__init__.py @@ -1,7 +1,6 @@ from flask import Blueprint from flask_restx import Api -from .jobs import ns as jobs_ns from .tokens import ns as tokens_ns bp = Blueprint('api', __name__) @@ -23,5 +22,4 @@ api = Api( version='1.0' ) -api.add_namespace(jobs_ns) api.add_namespace(tokens_ns) diff --git a/app/api/auth.py b/app/api/auth.py index 24e862eaedca68f110f4d74192ab55935f6ae463..fea4123b3fdc79bc4cc5d4905d7e9dd5992a6145 100644 --- a/app/api/auth.py +++ b/app/api/auth.py @@ -9,8 +9,12 @@ token_auth = HTTPTokenAuth() @basic_auth.verify_password def verify_password(email_or_username, password): - user = User.query.filter(or_(User.username == email_or_username, - User.email == email_or_username.lower())).first() + user = User.query.filter( + or_( + User.username == email_or_username, + User.email == email_or_username.lower() + ) + ).first() if user and user.verify_password(password): return user diff --git a/app/api/jobs.py b/app/api/jobs.py deleted file mode 100644 index 153d5060d6269d8c47305a7018e16eb8e882025b..0000000000000000000000000000000000000000 --- a/app/api/jobs.py +++ /dev/null @@ -1,48 +0,0 @@ -from flask_restx import Namespace, Resource -from .auth import token_auth -from ..jobs import tasks -from ..models import Job - - -ns = Namespace('jobs', description='Job operations') - - -@ns.route('') -class API_Jobs(Resource): - '''Shows a list of all jobs and lets you POST to add new job''' - - @ns.doc(security='apiKey') - @token_auth.login_required - def get(self): - '''List all jobs''' - # TODO: Implement the correct get_jobs functionality - jobs = Job.query.all() - return [job.to_dict(include_relationships=False) for job in jobs] - - @ns.doc(security='apiKey') - @token_auth.login_required - def post(self): - '''Create a new job''' - # TODO: Implement this - pass - - -@ns.route('/<hashid:id>') -class API_Job(Resource): - '''Show a single job and lets you delete it''' - - @ns.doc(security='apiKey') - @token_auth.login_required - def get(self, id): - '''Get a job by id''' - job = Job.query.get_or_404(id) - return job.to_dict(include_relationships=False) - - @ns.doc(security='apiKey') - @token_auth.login_required - def delete(self, id): - '''Delete a job by id''' - job = Job.query.get_or_404(id) - # We use this imported task because it will run in the background - tasks.delete_job(job.id) - return '', 204 diff --git a/app/auth/routes.py b/app/auth/routes.py index 2cda4bc28694a804f3631590b66d7ffd04b13091..3584225179afc684c42a4e4b0431cc1d4fbb81b7 100644 --- a/app/auth/routes.py +++ b/app/auth/routes.py @@ -60,28 +60,37 @@ def register(): return redirect(url_for('main.dashboard')) form = RegistrationForm(prefix='registration-form') if form.validate_on_submit(): - user = User(email=form.email.data.lower(), - password=form.password.data, - username=form.username.data) + user = User( + email=form.email.data.lower(), + password=form.password.data, + username=form.username.data + ) db.session.add(user) - db.session.commit() + db.session.flush(objects=[user]) + db.session.refresh(user) try: - os.makedirs(user.path) - except OSError: - current_app.logger.error( - f'Make dir {user.path} led to an OSError!') - db.session.delete(user) - db.session.commit() + user.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() abort(500) else: token = user.generate_confirmation_token() - msg = create_message(user.email, 'Confirm Your Account', - 'auth/email/confirm', token=token, user=user) + msg = create_message( + user.email, + 'Confirm Your Account', + 'auth/email/confirm', + token=token, + user=user + ) send(msg) flash('A confirmation email has been sent to you by email.') return redirect(url_for('.login')) - return render_template('auth/register.html.j2', form=form, - title='Register') + return render_template( + 'auth/register.html.j2', + form=form, + title='Register' + ) @bp.route('/confirm/<token>') diff --git a/app/cli.py b/app/cli.py index d885ff1201a7910a91d8e3ec646d55e6502902e7..e588eef97dbcebad5721ea21d99f6d06bcd68c8a 100644 --- a/app/cli.py +++ b/app/cli.py @@ -1,16 +1,44 @@ -from . import db -from .models import Corpus, Role +from flask import current_app from flask_migrate import upgrade +from . import db +from .models import Corpus, Job, Role, User, TesseractOCRModel +import json +import os +import re + + +def _make_default_dirs(): + base_dir = current_app.config['NOPAQUE_DATA_DIR'] + + default_directories = [ + os.path.join(base_dir, 'tmp'), + os.path.join(base_dir, 'users') + ] + for directory in default_directories: + if os.path.exists(directory): + if not os.path.isdir(directory): + raise NotADirectoryError(f'{directory} is not a directory') + else: + os.mkdir(directory) def register(app): @app.cli.command() def deploy(): ''' Run deployment tasks. ''' + # Make default directories + _make_default_dirs() + # migrate database to latest revision upgrade() - # create or update user roles - Role.insert_roles() + + # Insert/Update default database values + current_app.logger.info('Insert/Update default roles') + Role.insert_defaults() + current_app.logger.info('Insert/Update default users') + User.insert_defaults() + current_app.logger.info('Insert/Update default tesseract ocr models') + TesseractOCRModel.insert_defaults() @app.cli.group() def daemon(): @@ -40,3 +68,55 @@ def register(app): from unittest.suite import TestSuite tests: TestSuite = TestLoader().discover('tests') TextTestRunner(verbosity=2).run(tests) + + @app.cli.group() + def convert(): + ''' Datebase convert commands. ''' + + @convert.command() + def nlp_jobs(): + for job in Job.query.filter_by(service='nlp').all(): + job.service = 'spacy-nlp' + service_args = json.loads(job.service_args) + new_service_args = {} + for service_arg in service_args: + if service_arg == '--check-encoding': + new_service_args['encoding_detection'] = True + elif re.match(r'-l ([a-z]{2})', service_arg): + language_code = re.search(r'-l ([a-z]{2})', service_arg).group(1) # noqa + new_service_args['language'] = language_code + job.service_args = json.dumps(new_service_args) + db.session.commit() + + @convert.command() + def ocr_jobs(): + # Language code to TesseractOCRModel.title lookup + language_code_lookup = { + 'ara': 'Arabic', + 'chi_tra': 'Chinese - Traditional', + 'dan': 'Danish', + 'eng': 'English', + 'enm': 'English, Middle (1100-1500)', + 'fra': 'French', + 'frm': 'French, Middle (ca. 1400-1600)', + 'deu': 'German', + 'frk': 'German Fraktur', + 'ell': 'Greek, Modern (1453-)', + 'ita': 'Italian', + 'por': 'Portuguese', + 'rus': 'Russian', + 'spa': 'Spanish; Castilian' + } + for job in Job.query.filter_by(service='ocr').all(): + job.service = 'tesseract-ocr' + service_args = json.loads(job.service_args) + new_service_args = {} + for service_arg in service_args: + if service_arg == '--binarize': + new_service_args['binarization'] = True + elif re.match(r'-l ([a-z]{3})', service_arg): + language_code = re.search(r'-l ([a-z]{3})', service_arg).group(1) # noqa + tesseract_ocr_model = TesseractOCRModel.query.filter_by(title=language_code_lookup[language_code]).first() # noqa + new_service_args['model'] = tesseract_ocr_model.id + job.service_args = json.dumps(new_service_args) + db.session.commit() diff --git a/app/contribute/__init__.py b/app/contribute/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..15d172ecacca63634bafe62e3c2af0a79671dc8d --- /dev/null +++ b/app/contribute/__init__.py @@ -0,0 +1,5 @@ +from flask import Blueprint + + +bp = Blueprint('contribute', __name__) +from . import routes diff --git a/app/contribute/routes.py b/app/contribute/routes.py new file mode 100644 index 0000000000000000000000000000000000000000..e0b43231c16d9634b8f2355a3481fb21b2452924 --- /dev/null +++ b/app/contribute/routes.py @@ -0,0 +1,19 @@ +from flask import flash, redirect, render_template, url_for +from flask_login import login_required +from . import bp +from .. import db +from ..decorators import permission_required +from ..models import Permission, Role, User +from ..settings import tasks as settings_tasks + + +@bp.before_request +@login_required +@permission_required(Permission.CONTRIBUTE) +def before_request(): + pass + + +@bp.route('/') +def index(): + pass diff --git a/app/corpora/cqi_over_socketio/__init__.py b/app/corpora/cqi_over_socketio/__init__.py index 14031c4a9735d8f7db4e4f4d406dfd71f9f646bd..3a358758405879116c0469fe236daf9a94864fc0 100644 --- a/app/corpora/cqi_over_socketio/__init__.py +++ b/app/corpora/cqi_over_socketio/__init__.py @@ -93,12 +93,12 @@ def connect(auth): @socketio.on('disconnect', namespace=NAMESPACE) def disconnect(): + if 'd' not in session: + return session['d']['cqi_client_lock'].acquire() try: session['d']['cqi_client'].disconnect() - except cqi.errors.CQiException: - pass - except BrokenPipeError: + except (BrokenPipeError, cqi.errors.CQiException): pass session['d']['cqi_client_lock'].release() corpus = Corpus.query.get(session['d']['corpus_id']) diff --git a/app/corpora/cqi_over_socketio/utils.py b/app/corpora/cqi_over_socketio/utils.py index 7cbe07b9c6abbbed12146b68110d99c4c2e4639f..9763548aa315ed09528f6cadb0f1a130e93123f8 100644 --- a/app/corpora/cqi_over_socketio/utils.py +++ b/app/corpora/cqi_over_socketio/utils.py @@ -12,7 +12,10 @@ def cqi_over_socketio(f): f_args = {} # Check for missing args and if all provided args are of the right type for param in signature(f).parameters.values(): - if param.annotation == cqi.CQiClient: + if param.name == 'corpus_name': + f_args[param.name] = f'NOPAQUE_{session["d"]["corpus_id"]}' + continue + if param.name == 'cqi_client': f_args[param.name] = session['d']['cqi_client'] continue if param.default is param.empty: diff --git a/app/corpora/routes.py b/app/corpora/routes.py index 1086c2981c5075411fbd3945554854b9491538df..f6d95b542029fbb814459830dbca789e4c7af19b 100644 --- a/app/corpora/routes.py +++ b/app/corpora/routes.py @@ -1,6 +1,7 @@ from flask import (abort, current_app, flash, make_response, redirect, render_template, url_for, send_from_directory) from flask_login import current_user, login_required +from werkzeug.utils import secure_filename from . import bp from . import tasks from .forms import (AddCorpusFileForm, AddCorpusForm, EditCorpusFileForm, @@ -29,18 +30,20 @@ def add_corpus(): db.session.flush() db.session.refresh(corpus) try: - os.makedirs(corpus.path) + corpus.makedirs() except OSError as e: - current_app.logger.error(f'Could not add corpus: {e}') + current_app.logger.error(e) db.session.rollback() flash('Internal Server Error', 'error') abort(500) - else: - db.session.commit() - flash(f'Corpus "{corpus.title}" added!', 'corpus') - return redirect(url_for('.corpus', corpus_id=corpus.id)) - return render_template('corpora/add_corpus.html.j2', form=form, - title='Add corpus') + db.session.commit() + flash(f'Corpus "{corpus.title}" added', 'corpus') + return redirect(url_for('.corpus', corpus_id=corpus.id)) + return render_template( + 'corpora/add_corpus.html.j2', + form=form, + title='Add corpus' + ) @bp.route('/import', methods=['GET', 'POST']) @@ -174,7 +177,7 @@ def add_corpus_file(corpus_id): if not form.validate(): return make_response(form.errors, 400) # Save the file - form.file.data.save(os.path.join(corpus.path, form.file.data.filename)) + filename = secure_filename(form.file.data.filename) corpus_file = CorpusFile( address=form.address.data, author=form.author.data, @@ -182,9 +185,10 @@ def add_corpus_file(corpus_id): chapter=form.chapter.data, corpus=corpus, editor=form.editor.data, - filename=form.file.data.filename, + filename=filename, institution=form.institution.data, journal=form.journal.data, + mimetype='application/vrt+xml', pages=form.pages.data, publisher=form.publisher.data, publishing_year=form.publishing_year.data, @@ -192,12 +196,25 @@ def add_corpus_file(corpus_id): title=form.title.data ) db.session.add(corpus_file) + db.session.flush(objects=[corpus_file]) + db.session.refresh(corpus_file) + try: + form.file.data.save(corpus_file.path) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.add_corpus_file', corpus_id=corpus.id)}, 500) # noqa corpus.status = 'unprepared' db.session.commit() - flash(f'Corpus file "{corpus_file.filename}" added!', 'corpus') + flash(f'Corpus file "{corpus_file.title}" added!', 'corpus') return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201) # noqa - return render_template('corpora/add_corpus_file.html.j2', corpus=corpus, - form=form, title='Add corpus file') + return render_template( + 'corpora/add_corpus_file.html.j2', + corpus=corpus, + form=form, + title='Add corpus file' + ) @bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>/delete') diff --git a/app/daemon/__init__.py b/app/daemon/__init__.py index 009774562c51aa633f80106ad8e22828cdfdbb38..84ed0efee60ad60c3c82f0623c4aab816fad6a45 100644 --- a/app/daemon/__init__.py +++ b/app/daemon/__init__.py @@ -17,11 +17,7 @@ class Daemon(CheckCorporaMixin, CheckJobsMixin): def run(self): while True: - try: - self.check_corpora() - self.check_jobs() - db.session.commit() - except Exception as e: - current_app.logger.warning(e) - pass + self.check_corpora() + self.check_jobs() + db.session.commit() sleep(1.5) diff --git a/app/daemon/corpus_utils.py b/app/daemon/corpus_utils.py index 31cad92984a2575f126605ce9fd66769d83c3a1b..3962582eb0d1347cbb5fe4d1933f09dfd59a487d 100644 --- a/app/daemon/corpus_utils.py +++ b/app/daemon/corpus_utils.py @@ -26,37 +26,55 @@ class CheckCorporaMixin: def create_build_corpus_service(self, corpus): ''' # Docker service settings # ''' ''' ## Command ## ''' - command = 'docker-entrypoint.sh build-corpus' + command = ['bash', '-c'] + command.append( + f'mkdir /corpora/data/nopaque_{corpus.id}' + ' && ' + 'cwb-encode' + ' -c utf8' + f' -d /corpora/data/nopaque_{corpus.id}' + ' -f /root/files/corpus.vrt' + f' -R /usr/local/share/cwb/registry/nopaque_{corpus.id}' + ' -P pos -P lemma -P simple_pos' + ' -S ent:0+type -S s:0' + ' -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title' # noqa + ' -xsB -9' + ' && ' + f'cwb-make -V NOPAQUE_{corpus.id}' + ) ''' ## Constraints ## ''' constraints = ['node.role==worker'] ''' ## Image ## ''' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674' # noqa + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702' ''' ## Labels ## ''' labels = { 'origin': current_app.config['SERVER_NAME'], - 'type': 'build-corpus', + 'type': 'corpus.build', 'corpus_id': str(corpus.id) } ''' ## Mounts ## ''' - ''' ### Corpus file mount ### ''' - corpus_file_source = os.path.join(corpus.path, 'merged', 'corpus.vrt') - corpus_file_target = '/root/files/corpus.vrt' - corpus_file_mount = f'{corpus_file_source}:{corpus_file_target}:ro' - ''' ### Corpus data mount ### ''' - corpus_data_source = os.path.join(corpus.path, 'data') - corpus_data_target = '/corpora/data' - corpus_data_mount = f'{corpus_data_source}:{corpus_data_target}:rw' - # Make sure that their is no data in the corpus data directory - shutil.rmtree(corpus_data_source, ignore_errors=True) - os.mkdir(corpus_data_source) - ''' ### Corpus registry mount ### ''' - corpus_registry_source = os.path.join(corpus.path, 'registry') - corpus_registry_target = '/usr/local/share/cwb/registry' - corpus_registry_mount = f'{corpus_registry_source}:{corpus_registry_target}:rw' # noqa - # Make sure that their is no data in the corpus registry directory - shutil.rmtree(corpus_registry_source, ignore_errors=True) - os.mkdir(corpus_registry_source) - mounts = [corpus_file_mount, corpus_data_mount, corpus_registry_mount] + mounts = [] + ''' ### Data mount ### ''' + data_mount_source = os.path.join(corpus.path, 'cwb', 'data') + data_mount_target = '/corpora/data' + data_mount = f'{data_mount_source}:{data_mount_target}:rw' + # Make sure that their is no data in the data directory + shutil.rmtree(data_mount_source, ignore_errors=True) + os.makedirs(data_mount_source) + mounts.append(data_mount) + ''' ### File mount ### ''' + file_mount_source = os.path.join(corpus.path, 'cwb', 'corpus.vrt') + file_mount_target = '/root/files/corpus.vrt' + file_mount = f'{file_mount_source}:{file_mount_target}:ro' + mounts.append(file_mount) + ''' ### Registry mount ### ''' + registry_mount_source = os.path.join(corpus.path, 'cwb', 'registry') + registry_mount_target = '/usr/local/share/cwb/registry' + registry_mount = f'{registry_mount_source}:{registry_mount_target}:rw' + # Make sure that their is no data in the registry directory + shutil.rmtree(registry_mount_source, ignore_errors=True) + os.makedirs(registry_mount_source) + mounts.append(registry_mount) ''' ## Name ## ''' name = f'build-corpus_{corpus.id}' ''' ## Restart policy ## ''' @@ -74,7 +92,7 @@ class CheckCorporaMixin: except docker.errors.APIError as e: current_app.logger.error( f'Create service "{name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return corpus.status = 'queued' @@ -86,14 +104,14 @@ class CheckCorporaMixin: except docker.errors.NotFound as e: current_app.logger.error( f'Get service "{service_name}" failed ' - + f'due to "docker.errors.NotFound": {e}' + f'due to "docker.errors.NotFound": {e}' ) corpus.status = 'failed' return except docker.errors.APIError as e: current_app.logger.error( f'Get service "{service_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) service_tasks = service.tasks() if not service_tasks: @@ -108,36 +126,47 @@ class CheckCorporaMixin: corpus.status = 'failed' else: return - try: - service.remove() - except docker.errors.APIError as e: - current_app.logger.error( - f'Remove service "{service_name}" failed ' - + f'due to "docker.errors.APIError": {e}' - ) + # try: + # service.remove() + # except docker.errors.APIError as e: + # current_app.logger.error( + # f'Remove service "{service_name}" failed ' + # f'due to "docker.errors.APIError": {e}' + # ) def create_cqpserver_container(self, corpus): ''' # Docker container settings # ''' ''' ## Command ## ''' - command = 'cqpserver' + command = [] + command.append( + 'echo "host *;" > cqpserver.init' + ' && ' + 'echo "user anonymous \\"\\";" >> cqpserver.init' + ' && ' + 'cqpserver -I cqpserver.init' + ) ''' ## Detach ## ''' detach = True + ''' ## Entrypoint ## ''' + entrypoint = ['bash', '-c'] ''' ## Image ## ''' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cqpserver:r1674' # noqa + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}cwb:r1702' ''' ## Name ## ''' name = f'cqpserver_{corpus.id}' ''' ## Network ## ''' network = 'nopaque_default' ''' ## Volumes ## ''' + volumes = [] ''' ### Corpus data volume ### ''' - corpus_data_source = os.path.join(corpus.path, 'data') - corpus_data_target = '/corpora/data' - corpus_data_volume = f'{corpus_data_source}:{corpus_data_target}:rw' + data_volume_source = os.path.join(corpus.path, 'cwb', 'data') + data_volume_target = '/corpora/data' + data_volume = f'{data_volume_source}:{data_volume_target}:rw' + volumes.append(data_volume) ''' ### Corpus registry volume ### ''' - corpus_registry_source = os.path.join(corpus.path, 'registry') - corpus_registry_target = '/usr/local/share/cwb/registry' - corpus_registry_volume = f'{corpus_registry_source}:{corpus_registry_target}:rw' # noqa - volumes = [corpus_data_volume, corpus_registry_volume] + registry_volume_source = os.path.join(corpus.path, 'cwb', 'registry') + registry_volume_target = '/usr/local/share/cwb/registry' + registry_volume = f'{registry_volume_source}:{registry_volume_target}:rw' # noqa + volumes.append(registry_volume) # Check if a cqpserver container already exists. If this is the case, # remove it and create a new one try: @@ -147,7 +176,7 @@ class CheckCorporaMixin: except docker.errors.APIError as e: current_app.logger.error( f'Get container "{name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return else: @@ -156,7 +185,7 @@ class CheckCorporaMixin: except docker.errors.APIError as e: current_app.logger.error( f'Remove container "{name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return try: @@ -164,6 +193,7 @@ class CheckCorporaMixin: image, command=command, detach=detach, + entrypoint=entrypoint, volumes=volumes, name=name, network=network @@ -171,14 +201,14 @@ class CheckCorporaMixin: except docker.errors.ImageNotFound as e: current_app.logger.error( f'Run container "{name}" failed ' - + f'due to "docker.errors.ImageNotFound" error: {e}' + f'due to "docker.errors.ImageNotFound" error: {e}' ) corpus.status = 'failed' return except docker.errors.APIError as e: current_app.logger.error( f'Run container "{name}" failed ' - + f'due to "docker.errors.APIError" error: {e}' + f'due to "docker.errors.APIError" error: {e}' ) return corpus.status = 'analysing' @@ -190,14 +220,14 @@ class CheckCorporaMixin: except docker.errors.NotFound as e: current_app.logger.error( f'Get container "{container_name}" failed ' - + f'due to "docker.errors.NotFound": {e}' + f'due to "docker.errors.NotFound": {e}' ) corpus.num_analysis_sessions = 0 corpus.status = 'prepared' except docker.errors.APIError as e: current_app.logger.error( f'Get container "{container_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) def remove_cqpserver_container(self, corpus): @@ -210,7 +240,7 @@ class CheckCorporaMixin: except docker.errors.APIError as e: current_app.logger.error( f'Get container "{container_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return try: @@ -218,5 +248,5 @@ class CheckCorporaMixin: except docker.errors.APIError as e: current_app.logger.error( f'Remove container "{container_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) diff --git a/app/daemon/job_utils.py b/app/daemon/job_utils.py index 78bae839bdcf6056aa62cec14283faf8abf989d2..c640f35cf8a671d571da5340ef34eaf6c6617cb0 100644 --- a/app/daemon/job_utils.py +++ b/app/daemon/job_utils.py @@ -2,7 +2,7 @@ from datetime import datetime from flask import current_app from werkzeug.utils import secure_filename from .. import db -from ..models import Job, JobResult +from ..models import Job, JobResult, TesseractOCRModel import docker import json import os @@ -23,27 +23,34 @@ class CheckJobsMixin: ''' # Docker service settings # ''' ''' ## Service specific settings ## ''' if job.service == 'file-setup': - mem_mb = 2048 + mem_mb = 512 n_cores = 2 executable = 'file-setup' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:{job.service_version}' # noqa - elif job.service == 'ocr': - mem_mb = 4096 + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:v{job.service_version}' # noqa + elif job.service == 'tesseract-ocr': + mem_mb = 2048 n_cores = 4 executable = 'ocr' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:{job.service_version}' # noqa - elif job.service == 'nlp': - mem_mb = 2048 - n_cores = 2 + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:v{job.service_version}' # noqa + elif job.service == 'spacy-nlp': + mem_mb = 1024 + n_cores = 1 executable = 'nlp' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:{job.service_version}' # noqa + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:v{job.service_version}' # noqa ''' ## Command ## ''' command = f'{executable} -i /input -o /output' - command += ' --log-dir /input' + command += ' --log-dir /logs' command += f' --mem-mb {mem_mb}' command += f' --n-cores {n_cores}' - command += f' --zip [{job.service}]_{secure_filename(job.title)}' - command += ' ' + ' '.join(json.loads(job.service_args)) + service_args = json.loads(job.service_args) + if job.service == 'spacy-nlp': + command += f' -m {service_args["model"]}' + if 'encoding_detection' in service_args and service_args['encoding_detection']: # noqa + command += ' --check-encoding' + elif job.service == 'tesseract-ocr': + command += f' -m {service_args["model"]}' + if 'binarization' in service_args and service_args['binarization']: + command += ' --binarize' ''' ## Constraints ## ''' constraints = ['node.role==worker'] ''' ## Labels ## ''' @@ -53,20 +60,42 @@ class CheckJobsMixin: 'job_id': str(job.id) } ''' ## Mounts ## ''' - ''' ### Input mount ### ''' - input_mount_source = job.path - input_mount_target = '/input' + mounts = [] + ''' ### Input mount(s) ### ''' + input_mount_target_base = '/input' if job.service == 'file-setup': - input_mount_target += f'/{secure_filename(job.title)}' - input_mount = f'{input_mount_source}:{input_mount_target}:rw' + input_mount_target_base += f'/{secure_filename(job.title)}' + for job_input in job.inputs: + input_mount_source = job_input.path + input_mount_target = f'/{input_mount_target_base}/{job_input.filename}' # noqa + input_mount = f'{input_mount_source}:{input_mount_target}:ro' + mounts.append(input_mount) + if job.service == 'tesseract-ocr': + service_args = json.loads(job.service_args) + model = TesseractOCRModel.query.get(service_args['model']) + if model is None: + job.status = 'failed' + return + models_mount_source = model.path + models_mount_target = f'/usr/local/share/tessdata/{model.filename}' + models_mount = f'{models_mount_source}:{models_mount_target}:ro' + mounts.append(models_mount) ''' ### Output mount ### ''' - output_mount_source = os.path.join(job.path, 'output') + output_mount_source = os.path.join(job.path, 'results') output_mount_target = '/output' output_mount = f'{output_mount_source}:{output_mount_target}:rw' # Make sure that their is no data in the output directory shutil.rmtree(output_mount_source, ignore_errors=True) os.makedirs(output_mount_source) - mounts = [input_mount, output_mount] + mounts.append(output_mount) + ''' ### Pipeline data mount ### ''' + pyflow_data_mount_source = os.path.join(job.path, 'pipeline_data') + pyflow_data_mount_target = '/logs/pyflow.data' + pyflow_data_mount = f'{pyflow_data_mount_source}:{pyflow_data_mount_target}:rw' # noqa + # Make sure that their is no data in the output directory + shutil.rmtree(pyflow_data_mount_source, ignore_errors=True) + os.makedirs(pyflow_data_mount_source) + mounts.append(pyflow_data_mount) ''' ## Name ## ''' name = f'job_{job.id}' ''' ## Resources ## ''' @@ -90,7 +119,7 @@ class CheckJobsMixin: except docker.errors.APIError as e: current_app.logger.error( f'Create service "{name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return job.status = 'queued' @@ -102,14 +131,14 @@ class CheckJobsMixin: except docker.errors.NotFound as e: current_app.logger.error( f'Get service "{service_name}" failed ' - + f'due to "docker.errors.NotFound": {e}' + f'due to "docker.errors.NotFound": {e}' ) job.status = 'failed' return except docker.errors.APIError as e: current_app.logger.error( f'Get service "{service_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return service_tasks = service.tasks() @@ -121,13 +150,25 @@ class CheckJobsMixin: return elif job.status == 'running' and task_state == 'complete': job.status = 'complete' - results_dir = os.path.join(job.path, 'output') - result_files = [x for x in os.listdir(results_dir) if x.endswith('.zip')] # noqa - for result_file in result_files: - job_result = JobResult(filename=result_file, job=job) + results_dir = os.path.join(job.path, 'results') + with open(os.path.join(results_dir, 'outputs.json')) as f: + outputs = json.load(f) + for output in outputs: + filename = os.path.basename(output['file']) + job_result = JobResult( + filename=filename, + job=job, + mimetype=output['mimetype'] + ) + if 'description' in output: + job_result.description = output['description'] db.session.add(job_result) - db.session.flush() + db.session.flush(objects=[job_result]) db.session.refresh(job_result) + os.rename( + os.path.join(results_dir, output['file']), + job_result.path + ) elif job.status == 'running' and task_state == 'failed': job.status = 'failed' else: @@ -138,7 +179,7 @@ class CheckJobsMixin: except docker.errors.APIError as e: current_app.logger.error( f'Remove service "{service_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) def remove_job_service(self, job): @@ -151,7 +192,7 @@ class CheckJobsMixin: except docker.errors.APIError as e: current_app.logger.error( f'Get service "{service_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return try: @@ -159,7 +200,7 @@ class CheckJobsMixin: except docker.errors.APIError as e: current_app.logger.error( f'Update service "{service_name}" failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) return try: @@ -167,5 +208,5 @@ class CheckJobsMixin: except docker.errors.APIError as e: current_app.logger.error( f'Remove "{service_name}" service failed ' - + f'due to "docker.errors.APIError": {e}' + f'due to "docker.errors.APIError": {e}' ) diff --git a/app/jobs/routes.py b/app/jobs/routes.py index db8c686c793f112328db34976a81ded053ce6af7..4acd7c47675efcd78a80e68fc4cc0bd3298f597a 100644 --- a/app/jobs/routes.py +++ b/app/jobs/routes.py @@ -34,12 +34,14 @@ def delete_job(job_id): @login_required def download_job_input(job_id, job_input_id): job_input = JobInput.query.filter(JobInput.job_id == job_id, JobInput.id == job_input_id).first_or_404() # noqa - if not (job_input.job.user == current_user - or current_user.is_administrator()): + if not (job_input.job.user == current_user or current_user.is_administrator()): # noqa abort(403) - return send_from_directory(as_attachment=True, - directory=os.path.dirname(job_input.path), - filename=job_input.filename) + return send_from_directory( + as_attachment=True, + attachment_filename=job_input.filename, + directory=os.path.dirname(job_input.path), + filename=os.path.basename(job_input.path) + ) @bp.route('/<hashid:job_id>/restart') @@ -59,9 +61,11 @@ def restart(job_id): @login_required def download_job_result(job_id, job_result_id): job_result = JobResult.query.filter(JobResult.job_id == job_id, JobResult.id == job_result_id).first_or_404() # noqa - if not (job_result.job.user == current_user - or current_user.is_administrator()): + if not (job_result.job.user == current_user or current_user.is_administrator()): # noqa abort(403) - return send_from_directory(as_attachment=True, - directory=os.path.dirname(job_result.path), - filename=job_result.filename) + return send_from_directory( + as_attachment=True, + attachment_filename=job_result.filename, + directory=os.path.dirname(job_result.path), + filename=os.path.basename(job_result.path) + ) diff --git a/app/models.py b/app/models.py index 55013e924666f1c3ed3b4caf9064e71fb85d401e..d02b511ca51b945db51f43f459e0062f8c5ebd8d 100644 --- a/app/models.py +++ b/app/models.py @@ -4,13 +4,17 @@ from flask_hashids import HashidMixin from flask_login import UserMixin from itsdangerous import BadSignature, TimedJSONWebSignatureSerializer from time import sleep +from tqdm import tqdm from werkzeug.security import generate_password_hash, check_password_hash -import xml.etree.ElementTree as ET from . import db, login import base64 import enum +import json import os +import requests import shutil +import xml.etree.ElementTree as ET +import yaml class Permission(enum.IntEnum): @@ -25,7 +29,7 @@ class Permission(enum.IntEnum): class FileMixin: creation_date = db.Column(db.DateTime, default=datetime.utcnow) - filename = db.Column(db.String(256)) + filename = db.Column(db.String(255)) last_edited_date = db.Column(db.DateTime, default=datetime.utcnow) mimetype = db.Column(db.String(255)) @@ -86,7 +90,7 @@ class Role(HashidMixin, db.Model): return dict_role @staticmethod - def insert_roles(): + def insert_defaults(): roles = { 'User': [], 'API user': [Permission.USE_API], @@ -132,6 +136,12 @@ class User(HashidMixin, UserMixin, db.Model): db.String(16), default='all') # Backrefs: role: Role # Relationships + tesseract_ocr_models = db.relationship( + 'TesseractOCRModel', + backref='user', + cascade='all, delete-orphan', + lazy='dynamic' + ) corpora = db.relationship( 'Corpus', backref='user', @@ -221,6 +231,12 @@ class User(HashidMixin, UserMixin, db.Model): def is_administrator(self): return self.can(Permission.ADMINISTRATE) + def makedirs(self): + os.mkdir(self.path) + os.mkdir(os.path.join(self.path, 'tesseract_ocr_models')) + os.mkdir(os.path.join(self.path, 'corpora')) + os.mkdir(os.path.join(self.path, 'jobs')) + def revoke_token(self): self.token_expiration = datetime.utcnow() - timedelta(seconds=1) @@ -269,6 +285,21 @@ class User(HashidMixin, UserMixin, db.Model): return None return user + @staticmethod + def insert_defaults(): + if User.query.filter_by(username='nopaque').first() is not None: + return + user = User(username='nopaque') + db.session.add(user) + db.session.flush(objects=[user]) + db.session.refresh(user) + try: + user.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + db.session.commit() + @staticmethod def reset_password(token, new_password): s = TimedJSONWebSignatureSerializer(current_app.config['SECRET_KEY']) @@ -284,6 +315,72 @@ class User(HashidMixin, UserMixin, db.Model): return True +class TesseractOCRModel(FileMixin, HashidMixin, db.Model): + __tablename__ = 'tesseract_ocr_models' + # Primary key + id = db.Column(db.Integer, primary_key=True) + # Foreign keys + user_id = db.Column(db.Integer, db.ForeignKey('users.id')) + # Fields + compatible_service_versions = db.Column(db.String(255)) + description = db.Column(db.String(255)) + publisher = db.Column(db.String(128)) + publishing_year = db.Column(db.Integer) + title = db.Column(db.String(64)) + version = db.Column(db.String(16)) + # Backrefs: user: User + + @property + def path(self): + return os.path.join( + self.user.path, + 'tesseract_ocr_models', + str(self.id) + ) + + @staticmethod + def insert_defaults(): + user = User.query.filter_by(username='nopaque').first() + defaults_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'TesseractOCRModel.defaults.yml' + ) + with open(defaults_file, 'r') as f: + defaults = yaml.safe_load(f) + for m in defaults: + if TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() is not None: # noqa + continue + tesseract_ocr_model = TesseractOCRModel( + compatible_service_versions=json.dumps(m['compatible_service_versions']), # noqa + description=m['description'], + publisher=m['publisher'], + publishing_year=m['publishing_year'], + title=m['title'], + user=user, + version=m['version'] + ) + db.session.add(tesseract_ocr_model) + db.session.flush(objects=[tesseract_ocr_model]) + db.session.refresh(tesseract_ocr_model) + tesseract_ocr_model.filename = f'{tesseract_ocr_model.id}.traineddata' # noqa + r = requests.get(m['url'], stream=True) + pbar = tqdm( + desc=f'{tesseract_ocr_model.title} ({tesseract_ocr_model.filename})', # noqa + unit="B", + unit_scale=True, + unit_divisor=1024, + total=int(r.headers['Content-Length']) + ) + pbar.clear() + with open(tesseract_ocr_model.path, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + pbar.update(len(chunk)) + f.write(chunk) + pbar.close() + db.session.commit() + + class JobInput(FileMixin, HashidMixin, db.Model): __tablename__ = 'job_inputs' # Primary key @@ -309,7 +406,7 @@ class JobInput(FileMixin, HashidMixin, db.Model): @property def path(self): - return os.path.join(self.job.path, self.filename) + return os.path.join(self.job.path, 'inputs', str(self.id)) def to_dict(self, backrefs=False, relationships=False): dict_job_input = { @@ -347,6 +444,8 @@ class JobResult(FileMixin, HashidMixin, db.Model): id = db.Column(db.Integer, primary_key=True) # Foreign keys job_id = db.Column(db.Integer, db.ForeignKey('jobs.id')) + # Fields + description = db.Column(db.String(255)) # Backrefs: job: Job def __repr__(self): @@ -366,12 +465,13 @@ class JobResult(FileMixin, HashidMixin, db.Model): @property def path(self): - return os.path.join(self.job.path, 'output', self.filename) + return os.path.join(self.job.path, 'results', str(self.id)) def to_dict(self, backrefs=False, relationships=False): dict_job_result = { 'id': self.hashid, 'job_id': self.job.hashid, + 'description': self.description, 'download_url': self.download_url, 'url': self.url, **self.file_mixin_to_dict( @@ -414,8 +514,8 @@ class Job(HashidMixin, db.Model): end_date = db.Column(db.DateTime()) service = db.Column(db.String(64)) ''' - ' Service specific arguments as string list. - ' Example: ["-l eng", "--binarize"] + ' Dictionary as JSON formatted string. + ' Example: {"binarization": True} ''' service_args = db.Column(db.String(255)) service_version = db.Column(db.String(16)) @@ -472,6 +572,12 @@ class Job(HashidMixin, db.Model): shutil.rmtree(self.path, ignore_errors=True) db.session.delete(self) + def makedirs(self): + os.mkdir(self.path) + os.mkdir(os.path.join(self.path, 'inputs')) + os.mkdir(os.path.join(self.path, 'pipeline_data')) + os.mkdir(os.path.join(self.path, 'results')) + def restart(self): ''' Restart a job - only if the status is complete or failed @@ -479,7 +585,7 @@ class Job(HashidMixin, db.Model): if self.status not in ['complete', 'failed']: raise Exception('Could not restart job: status is not "complete/failed"') # noqa - shutil.rmtree(os.path.join(self.path, 'output'), ignore_errors=True) + shutil.rmtree(os.path.join(self.path, 'results'), ignore_errors=True) shutil.rmtree(os.path.join(self.path, 'pyflow.data'), ignore_errors=True) # noqa for result in self.results: db.session.delete(result) @@ -487,6 +593,10 @@ class Job(HashidMixin, db.Model): self.status = 'submitted' def to_dict(self, backrefs=False, relationships=False): + service_args = json.loads(self.service_args) + if self.service == 'tesseract-ocr' and 'model' in service_args: + tesseract_ocr_pipeline_model = TesseractOCRModel.query.get(service_args['model']) # noqa + service_args['model'] = tesseract_ocr_pipeline_model.title dict_job = { 'id': self.hashid, 'user_id': self.user.hashid, @@ -494,7 +604,7 @@ class Job(HashidMixin, db.Model): 'description': self.description, 'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z', # noqa 'service': self.service, - 'service_args': self.service_args, + 'service_args': service_args, 'service_version': self.service_version, 'status': self.status, 'title': self.title, @@ -550,7 +660,7 @@ class CorpusFile(FileMixin, HashidMixin, db.Model): @property def path(self): - return os.path.join(self.corpus.path, self.filename) + return os.path.join(self.corpus.path, 'files', str(self.id)) @property def url(self): @@ -659,28 +769,27 @@ class Corpus(HashidMixin, db.Model): return self.user.hashid def build(self): - output_dir = os.path.join(self.path, 'merged') - shutil.rmtree(output_dir, ignore_errors=True) - os.mkdir(output_dir) - output_file = os.path.join(output_dir, 'corpus.vrt') corpus_element = ET.fromstring('<corpus>\n</corpus>') for corpus_file in self.files: element_tree = ET.parse(corpus_file.path) - text_node = element_tree.find('text') - text_node.set('address', corpus_file.address or 'NULL') - text_node.set('author', corpus_file.author) - text_node.set('booktitle', corpus_file.booktitle or 'NULL') - text_node.set('chapter', corpus_file.chapter or 'NULL') - text_node.set('editor', corpus_file.editor or 'NULL') - text_node.set('institution', corpus_file.institution or 'NULL') - text_node.set('journal', corpus_file.journal or 'NULL') - text_node.set('pages', corpus_file.pages or 'NULL') - text_node.set('publisher', corpus_file.publisher or 'NULL') - text_node.set('publishing_year', str(corpus_file.publishing_year)) - text_node.set('school', corpus_file.school or 'NULL') - text_node.set('title', corpus_file.title) - corpus_element.insert(1, text_node) - ET.ElementTree(corpus_element).write(output_file, encoding='utf-8') + text_element = element_tree.getroot() + text_element.set('address', corpus_file.address or 'NULL') + text_element.set('author', corpus_file.author) + text_element.set('booktitle', corpus_file.booktitle or 'NULL') + text_element.set('chapter', corpus_file.chapter or 'NULL') + text_element.set('editor', corpus_file.editor or 'NULL') + text_element.set('institution', corpus_file.institution or 'NULL') + text_element.set('journal', corpus_file.journal or 'NULL') + text_element.set('pages', corpus_file.pages or 'NULL') + text_element.set('publisher', corpus_file.publisher or 'NULL') + text_element.set('publishing_year', str(corpus_file.publishing_year)) # noqa + text_element.set('school', corpus_file.school or 'NULL') + text_element.set('title', corpus_file.title) + corpus_element.insert(1, text_element) + ET.ElementTree(corpus_element).write( + os.path.join(self.path, 'cwb', 'corpus.vrt'), + encoding='utf-8' + ) self.last_edited_date = datetime.utcnow() self.status = 'submitted' @@ -688,6 +797,13 @@ class Corpus(HashidMixin, db.Model): shutil.rmtree(self.path, ignore_errors=True) db.session.delete(self) + def makedirs(self): + os.mkdir(self.path) + os.mkdir(os.path.join(self.path, 'files')) + os.mkdir(os.path.join(self.path, 'cwb')) + os.mkdir(os.path.join(self.path, 'cwb', 'data')) + os.mkdir(os.path.join(self.path, 'cwb', 'registry')) + def to_dict(self, backrefs=False, relationships=False): dict_corpus = { 'id': self.hashid, diff --git a/app/services/__init__.py b/app/services/__init__.py index 5c553e896df9daa489a822542a18195d2d55a761..e41a895df47464871c47a5c2e2d80e872485d17e 100644 --- a/app/services/__init__.py +++ b/app/services/__init__.py @@ -1,77 +1,13 @@ from flask import Blueprint +import os +import yaml -SERVICES = { - 'file-setup': { - 'name': 'File setup', - 'versions': { - 'latest': '1.0.0b', - '1.0.0b': { - 'publishing_data': { - 'date': None, - 'title': 'nopaque File setup service', - 'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/tree/1.0.0b', # noqa - 'version': '1.0.0' - } - } - } - }, - 'nlp': { - 'name': 'Natural Language Processing', - 'versions': { - 'latest': '1.0.0b', - '1.0.0b': { - 'check_encoding': True, - 'models': { - 'de': 'German', - 'en': 'English', - 'it': 'Italian', - 'nl': 'Dutch', - 'pl': 'Polish', - 'zh': 'Chinese' - }, - 'publishing_data': { - 'date': None, - 'title': 'nopaque NLP service', - 'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/tree/1.0.0b', # noqa - 'version': '1.0.0' - } - } - } - }, - 'ocr': { - 'name': 'Optical Character Recognition', - 'versions': { - 'latest': '1.0.0b', - '1.0.0b': { - 'binarization': True, - 'models': { - 'ara': 'Arabic', - 'chi_tra': 'Chinese - Traditional', - 'dan': 'Danish', - 'eng': 'English', - 'enm': 'English, Middle 1100-1500', - 'fra': 'French', - 'frm': 'French, Middle ca. 1400-1600', - 'deu': 'German', - 'frk': 'German Fraktur', - 'ell': 'Greek, Modern (1453-)', - 'ita': 'Italian', - 'por': 'Portuguese', - 'rus': 'Russian', - 'spa': 'Spanish; Castilian', - }, - 'publishing_data': { - 'date': None, - 'title': 'nopaque OCR service', - 'url': 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/tree/1.0.0b', # noqa - 'version': '1.0.0' - } - } - } - } -} +services_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), 'services.yml') +with open(services_file, 'r') as f: + SERVICES = yaml.safe_load(f) bp = Blueprint('services', __name__) -from . import routes +from . import routes # noqa diff --git a/app/services/forms.py b/app/services/forms.py index e77f1db33e3709196536b68d0345ea0ee2e635a7..0bebfb02db3496593c595ebc19922eebe052f144 100644 --- a/app/services/forms.py +++ b/app/services/forms.py @@ -1,3 +1,4 @@ +from app.models import TesseractOCRModel from flask_wtf import FlaskForm from wtforms import (BooleanField, MultipleFileField, SelectField, StringField, SubmitField, ValidationError) @@ -6,85 +7,105 @@ from . import SERVICES class AddJobForm(FlaskForm): - description = StringField('Description', - validators=[DataRequired(), Length(1, 255)]) + description = StringField('Description', validators=[DataRequired(), Length(1, 255)]) # noqa submit = SubmitField() title = StringField('Title', validators=[DataRequired(), Length(1, 32)]) version = SelectField('Version', validators=[DataRequired()]) -class AddNLPJobForm(AddJobForm): - check_encoding = BooleanField('Check encoding') +class AddSpacyNLPJobForm(AddJobForm): + encoding_detection = BooleanField('Encoding detection') files = MultipleFileField('Files', validators=[DataRequired()]) - language = SelectField('Language', choices=[('', 'Choose your option')], - default='', validators=[DataRequired()]) - - def validate_check_encoding(self, field): - if field.data and 'check_encoding' not in SERVICES['nlp']['versions'][self.version.data]: # noqa - raise ValidationError('Check encoding is not available in this version') # noqa + model = SelectField( + 'Model', + choices=[('', 'Choose your option')], + default='', + validators=[DataRequired()] + ) + + def validate_encoding_detection(self, field): + service_info = SERVICES['spacy-nlp']['versions'][self.version.data] + if field.data and 'encoding_detection' not in service_info: + raise ValidationError('Encoding detection is not available') def validate_files(form, field): + valid_extensions = ['.txt'] for file in field.data: - if not file.filename.lower().endswith('.txt'): - raise ValidationError('File does not have an approved ' - 'extension: .txt') + if not file.filename.lower().endswith(tuple(valid_extensions)): + raise ValidationError( + 'File does not have an approved extension: ' + '/'.join(valid_extensions) + ) def __init__(self, *args, **kwargs): - version = kwargs.pop('version', SERVICES['nlp']['versions']['latest']) + version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version']) # noqa super().__init__(*args, **kwargs) - if 'check_encoding' not in SERVICES['nlp']['versions'][version]: - self.check_encoding.render_kw = {'disabled': True} - self.language.choices += [(x, y) for x, y in SERVICES['nlp']['versions'][version]['models'].items()] # noqa - self.version.choices = [(x, x) for x in SERVICES['nlp']['versions'] if x != 'latest'] # noqa + service_info = SERVICES['spacy-nlp']['versions'][version] + if 'check_encoding' not in service_info['methods']: + self.encoding_detection.render_kw = {'disabled': True} + self.model.choices += [(x, y) for x, y in service_info['models'].items()] # noqa + self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']] # noqa self.version.default = version -class AddOCRJobForm(AddJobForm): - binarization = BooleanField('Binarazation') +class AddTesseractOCRJobForm(AddJobForm): + binarization = BooleanField('Binarization') files = MultipleFileField('Files', validators=[DataRequired()]) - language = SelectField('Language', choices=[('', 'Choose your option')], - default='', validators=[DataRequired()]) + model = SelectField( + 'Model', + choices=[('', 'Choose your option')], + default='', + validators=[DataRequired()] + ) def validate_binarization(self, field): - if field.data and 'binarization' not in SERVICES['ocr']['versions'][self.version.data]: # noqa - raise ValidationError('Binarization is not available in this version') # noqa + service_info = SERVICES['tesseract-ocr']['versions'][self.version.data] + if field.data and 'binarization' not in service_info: + raise ValidationError('Binarization is not available') def validate_files(self, field): + valid_extensions = ['.pdf'] for file in field.data: - if not file.filename.lower().endswith('.pdf'): - raise ValidationError('File does not have an approved ' - 'extension: .pdf') + if not file.filename.lower().endswith(tuple(valid_extensions)): + raise ValidationError( + 'File does not have an approved extension: ' + '/'.join(valid_extensions) + ) def __init__(self, *args, **kwargs): - version = kwargs.pop('version', SERVICES['ocr']['versions']['latest']) + version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version']) # noqa super().__init__(*args, **kwargs) - if 'binarization' not in SERVICES['ocr']['versions'][version]: + service_info = SERVICES['tesseract-ocr']['versions'][version] + if 'binarization' not in service_info['methods']: self.binarization.render_kw = {'disabled': True} - self.language.choices += [(x, y) for x, y in SERVICES['ocr']['versions'][version]['models'].items()] # noqa - self.version.choices = [(x, x) for x in SERVICES['ocr']['versions'] if x != 'latest'] # noqa - self.version.default = version + self.model.choices += [(x.hashid, x.title) for x in TesseractOCRModel.query.all()] # noqa + self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']] # noqa + self.version.data = version + self.version.default = SERVICES['tesseract-ocr']['latest_version'] class AddFileSetupJobForm(AddJobForm): files = MultipleFileField('Files', validators=[DataRequired()]) def validate_files(form, field): + valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif'] for file in field.data: - if not file.filename.lower().endswith(('.jpeg', '.jpg', '.png', - '.tiff', '.tif')): - raise ValidationError('File does not have an approved ' - 'extension: .jpeg | .jpg | .png | .tiff ' - '| .tif') + if not file.filename.lower().endswith(tuple(valid_extensions)): + raise ValidationError( + 'File does not have an approved extension: ' + '/'.join(valid_extensions) + ) def __init__(self, *args, **kwargs): - version = kwargs.pop('version', SERVICES['file-setup']['versions']['latest']) + version = kwargs.pop('version', SERVICES['file-setup']['latest_version']) # noqa super().__init__(*args, **kwargs) - self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions'] if x != 'latest'] # noqa - self.version.default = version + self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']] # noqa + self.version.data = version + self.version.default = SERVICES['file-setup']['latest_version'] AddJobForms = { 'file-setup': AddFileSetupJobForm, - 'ocr': AddOCRJobForm, - 'nlp': AddNLPJobForm + 'tesseract-ocr': AddTesseractOCRJobForm, + 'spacy-nlp': AddSpacyNLPJobForm } diff --git a/app/services/routes.py b/app/services/routes.py index 805ab692fe80d7f5a0b10b99a845cf1b95e61e6d..d430e61e4ab2cef5f1f0715ed956ca2e2cd66ba3 100644 --- a/app/services/routes.py +++ b/app/services/routes.py @@ -1,3 +1,4 @@ +from app import hashids from flask import (abort, current_app, flash, make_response, render_template, request, url_for) from flask_login import current_user, login_required @@ -8,7 +9,6 @@ from .. import db from .forms import AddJobForms from ..models import Job, JobInput import json -import os @bp.route('/corpus-analysis') @@ -24,57 +24,65 @@ def service(service): # Check if the requested service exist if service not in SERVICES or service not in AddJobForms: abort(404) - version = request.args.get( - 'version', SERVICES[service]['versions']['latest']) + version = request.args.get('version', SERVICES[service]['latest_version']) if version not in SERVICES[service]['versions']: abort(404) form = AddJobForms[service](prefix='add-job-form', version=version) - form.version.data = version title = SERVICES[service]['name'] - versions = SERVICES[service]['versions'] if form.is_submitted(): if not form.validate(): return make_response(form.errors, 400) - service_args = [] - if service == 'nlp': - service_args.append(f'-l {form.language.data}') - if form.check_encoding.data: - service_args.append('--check-encoding') - if service == 'ocr': - service_args.append(f'-l {form.language.data}') + service_args = {} + if service == 'spacy-nlp': + service_args['model'] = form.model.data + if form.encoding_detection.data: + service_args['encoding_detection'] = True + if service == 'tesseract-ocr': + service_args['model'] = hashids.decode(form.model.data) if form.binarization.data: - service_args.append('--binarize') - job = Job(user=current_user, - description=form.description.data, - service=service, service_args=json.dumps(service_args), - service_version=form.version.data, - status='preparing', title=form.title.data) + service_args['binarization'] = True + job = Job( + user=current_user, + description=form.description.data, + service=service, + service_args=json.dumps(service_args), + service_version=form.version.data, + status='preparing', + title=form.title.data + ) db.session.add(job) - db.session.flush() + db.session.flush(objects=[job]) db.session.refresh(job) try: - os.makedirs(job.path) - except OSError: - current_app.logger.error(f'Make dir {job.path} led to an OSError!') + job.makedirs() + except OSError as e: + current_app.logger.error(e) db.session.rollback() flash('Internal Server Error', 'error') - return make_response( - {'redirect_url': url_for('.service', service=service)}, 500) - else: - for file in form.files.data: - filename = secure_filename(file.filename) - job_input = JobInput( - filename=filename, job=job, mimetype=file.mimetype) + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + for file in form.files.data: + filename = secure_filename(file.filename) + job_input = JobInput( + filename=filename, + job=job, + mimetype=file.mimetype + ) + db.session.add(job_input) + db.session.flush(objects=[job_input]) + db.session.refresh(job_input) + try: file.save(job_input.path) - db.session.add(job_input) - job.status = 'submitted' - db.session.commit() - flash(f'Job "{job.title}" added', 'job') - return make_response( - {'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job.status = 'submitted' + db.session.commit() + flash(f'Job "{job.title}" added', 'job') + return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) # noqa return render_template( f'services/{service.replace("-", "_")}.html.j2', form=form, - title=title, - versions=versions + title=title ) diff --git a/app/services/services.yml b/app/services/services.yml new file mode 100644 index 0000000000000000000000000000000000000000..0c82c3d91c37bc0ae292fba24188ab1462972a96 --- /dev/null +++ b/app/services/services.yml @@ -0,0 +1,38 @@ +# TODO: This could also be done via GitLab/GitHub APIs +#file-setup-pipeline: +file-setup: + name: 'File setup pipeline' + latest_version: '0.1.0' + versions: + 0.1.0: + publisher: 'Bielefeld University - CRC 1288 - INF' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0' +#spacy-nlp-pipeline: +spacy-nlp: + name: 'spaCy NLP' + latest_version: '0.1.0' + versions: + 0.1.0: + methods: + - 'encoding_detection' + models: + de: 'German' + en: 'English' + it: 'Italian' + pl: 'Polish' + zh: 'Chinese' + publisher: 'Bielefeld University - CRC 1288 - INF' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0' +#tesseract-ocr-pipeline: +tesseract-ocr: + name: 'Tesseract OCR' + latest_version: '0.1.0' + versions: + 0.1.0: + methods: + - 'binarization' + publisher: 'Bielefeld University - CRC 1288 - INF' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0' diff --git a/app/static/css/nopaque.css b/app/static/css/nopaque.css index 90f4df68e8f5cfa46c8431424291ae958f850e29..ee5377e123f5f0e1ef825a04e44271f7fd6b6201 100644 --- a/app/static/css/nopaque.css +++ b/app/static/css/nopaque.css @@ -50,8 +50,8 @@ h1 .nopaque-icons, h2 .nopaque-icons, h3 .nopaque-icons, h4 .nopaque-icons, } .nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";} .nopaque-icons.service-icon[data-service="file-setup"]:empty:before {content: "E";} -.nopaque-icons.service-icon[data-service="nlp"]:empty:before {content: "G";} -.nopaque-icons.service-icon[data-service="ocr"]:empty:before {content: "F";} +.nopaque-icons.service-icon[data-service="spacy-nlp"]:empty:before {content: "G";} +.nopaque-icons.service-icon[data-service="tesseract-ocr"]:empty:before {content: "F";} .status-text[data-status]:empty:before {content: attr(data-status);} diff --git a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js index ad324e34d2c8db2fbf59d7514b3672ba67cde5f6..c07ff35d74da28be82babd0037a6b433240c6b84 100644 --- a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js +++ b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js @@ -53,7 +53,7 @@ class CorpusAnalysisApp { this.data.cQiClient = new CQiClient(this.settings.corpusId); this.data.cQiClient.connect() .then(cQiStatus => { - return this.data.cQiClient.corpora.get('CORPUS'); + return this.data.cQiClient.corpora.get(`NOPAQUE_${this.settings.corpusId}`); }) .then( cQiCorpus => { diff --git a/app/static/js/RessourceDisplays/JobDisplay.js b/app/static/js/RessourceDisplays/JobDisplay.js index 61222693112a218c9b4eb013d509c33dff29262e..921029087d89fb71bcbe1b51325727c80f820920 100644 --- a/app/static/js/RessourceDisplays/JobDisplay.js +++ b/app/static/js/RessourceDisplays/JobDisplay.js @@ -100,7 +100,10 @@ class JobDisplay extends RessourceDisplay { } setServiceArgs(serviceArgs) { - this.setElements(this.displayElement.querySelectorAll('.job-service-args'), serviceArgs); + this.setElements( + this.displayElement.querySelectorAll('.job-service-args'), + JSON.stringify(serviceArgs) + ); } setServiceVersion(serviceVersion) { diff --git a/app/static/js/RessourceLists/JobResultList.js b/app/static/js/RessourceLists/JobResultList.js index 56399bcbdfcce357d05a5df971a9bce013b69040..708b25f22504c06552213100083e3f2cfddd0489 100644 --- a/app/static/js/RessourceLists/JobResultList.js +++ b/app/static/js/RessourceLists/JobResultList.js @@ -10,25 +10,10 @@ class JobResultList extends RessourceList { </tr> `.trim(), ressourceMapper: jobResult => { - let description; - - if (jobResult.filename.endsWith('.pdf.zip')) { - description = 'PDF files with text layer'; - } else if (jobResult.filename.endsWith('.txt.zip')) { - description = 'Raw text files'; - } else if (jobResult.filename.endsWith('.vrt.zip')) { - description = 'VRT compliant files including the NLP data'; - } else if (jobResult.filename.endsWith('.xml.zip')) { - description = 'TEI compliant files'; - } else if (jobResult.filename.endsWith('.poco.zip')) { - description = 'HOCR and image files for post correction (PoCo)'; - } else { - description = 'All result files created during this job'; - } return { id: jobResult.id, creationDate: jobResult.creation_date, - description: description, + description: jobResult.description, filename: jobResult.filename }; }, diff --git a/app/templates/_colors.html.j2 b/app/templates/_colors.html.j2 index 84715cbe5268de1e0c788b8aa67b1ea6dfe97bfe..a6ac0ed82a107e3154e6def33bc138ae2d9c6273 100644 --- a/app/templates/_colors.html.j2 +++ b/app/templates/_colors.html.j2 @@ -19,12 +19,12 @@ 'darken': '#a1b300', 'lighten': '#f2f3e1' }, - 'nlp': { + 'spacy-nlp': { 'base': '#98acd2', 'darken': '#0064a3', 'lighten': '#e5e8f5' }, - 'ocr': { + 'tesseract-ocr': { 'base': '#a9d8c8', 'darken': '#00a58b', 'lighten': '#e7f4f1' diff --git a/app/templates/_sidenav.html.j2 b/app/templates/_sidenav.html.j2 index c3ac9ab8f4cd2e9bd3240e6fa085d47eef19e3f9..8729f4f8a81d0dcb52b14a775c5405a521301922 100644 --- a/app/templates/_sidenav.html.j2 +++ b/app/templates/_sidenav.html.j2 @@ -15,8 +15,8 @@ <li><div class="divider"></div></li> <li><a class="subheader">Processes & Services</a></li> <li class="service-color service-color-border border-darken" data-service="file-setup" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='file-setup') }}"><i class="nopaque-icons service-icon" data-service="file-setup"></i>File setup</a></li> - <li class="service-color service-color-border border-darken" data-service="ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='ocr') }}"><i class="nopaque-icons service-icon" data-service="ocr"></i>OCR</a></li> - <li class="service-color service-color-border border-darken" data-service="nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='nlp') }}"><i class="nopaque-icons service-icon" data-service="nlp"></i>NLP</a></li> + <li class="service-color service-color-border border-darken" data-service="tesseract-ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='tesseract-ocr') }}"><i class="nopaque-icons service-icon" data-service="tesseract-ocr"></i>OCR</a></li> + <li class="service-color service-color-border border-darken" data-service="spacy-nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='spacy-nlp') }}"><i class="nopaque-icons service-icon" data-service="spacy-nlp"></i>NLP</a></li> <li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='corpus-analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li> <li><div class="divider"></div></li> <li><a class="subheader">Account</a></li> @@ -28,6 +28,9 @@ {% if current_user.can(Permission.ADMINISTRATE) %} <li><a href="{{ url_for('admin.index') }}"><i class="material-icons">admin_panel_settings</i>Administration</a></li> {% endif %} + {% if current_user.can(Permission.CONTRIBUTE) %} + <li><a href="{{ url_for('contribute.index') }}"><i class="material-icons">new_label</i>Contribute</a></li> + {% endif %} {% if current_user.can(Permission.USE_API) %} <li><a href="{{ url_for('api.doc') }}"><i class="material-icons">api</i>API</a></li> {% endif %} diff --git a/app/templates/main/dashboard.html.j2 b/app/templates/main/dashboard.html.j2 index 1e763c3e8ca364849b1c7c21b9c39320c175fff6..05f5b804fc9cbd18e90de06915ea9bb3193a4be0 100644 --- a/app/templates/main/dashboard.html.j2 +++ b/app/templates/main/dashboard.html.j2 @@ -120,32 +120,32 @@ </a> <br><br> <p class="service-color-text darken" data-service="file-setup"><b>File setup</b></p> - <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p> + <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing.</p> <a href="{{ url_for('services.service', service='file-setup') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup">Create Job</a> </div> </div> <div class="col s12 m4"> <div class="card-panel center-align hoverable"> <br> - <a href="{{ url_for('services.service', service='ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> - <i class="nopaque-icons service-color darken service-icon" data-service="ocr" style="font-size: 2.5rem;"></i> + <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> + <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr" style="font-size: 2.5rem;"></i> </a> <br><br> - <p class="service-color-text darken" data-service="ocr"><b>Optical Character Recognition</b></p> + <p class="service-color-text darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p> <p class="light">nopaque converts your image data – like photos or scans – into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.</p> - <a href="{{ url_for('services.service', service='ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="ocr">Create Job</a> + <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="tesseract-ocr">Create Job</a> </div> </div> <div class="col s12 m4"> <div class="card-panel center-align hoverable"> <br> - <a href="{{ url_for('services.service', service='nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> - <i class="nopaque-icons service-color darken service-icon" data-service="nlp" style="font-size: 2.5rem;"></i> + <a href="{{ url_for('services.service', service='spacy-nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> + <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp" style="font-size: 2.5rem;"></i> </a> <br><br> - <p class="service-color-text darken" data-service="nlp"><b>Natural Language Processing</b></p> + <p class="service-color-text darken" data-service="spacy-nlp"><b>Natural Language Processing</b></p> <p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p> - <a href="{{ url_for('services.service', service='nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="nlp">Create Job</a> + <a href="{{ url_for('services.service', service='spacy-nlp') }}" class="waves-effect waves-light btn service-color darken" data-service="spacy-nlp">Create Job</a> </div> </div> </div> diff --git a/app/templates/main/index.html.j2 b/app/templates/main/index.html.j2 index bbc4428322de18373cf286fe13ddb6bddbcdb15a..0bd343c34b5b090f44742358cc6d830e75f7fa85 100644 --- a/app/templates/main/index.html.j2 +++ b/app/templates/main/index.html.j2 @@ -84,11 +84,11 @@ <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing and the application of other services.</p> </div> <div class="col s12 m6 l3 center-align"> - <a href="{{ url_for('services.service', service='ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> - <i class="nopaque-icons service-color darken service-icon" data-service="ocr"></i> + <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> + <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i> </a> <br><br> - <p class="service-color-text text-darken" data-service="ocr"><b>Optical Character Recognition</b></p> + <p class="service-color-text text-darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p> <p class="light">nopaque converts your image data – like photos or scans – into text data through OCR making it machine readable. This step enables you to proceed with further computational analysis of your documents.</p> </div> <div class="col s12 m6 l3 center-align"> diff --git a/app/templates/services/nlp.html.j2 b/app/templates/services/spacy_nlp.html.j2 similarity index 93% rename from app/templates/services/nlp.html.j2 rename to app/templates/services/spacy_nlp.html.j2 index d07470e1908e567f66fbc1eb8daff41a38be08a3..30fab84cff34a8a42d8d9bd78e208d4fcc6cca8c 100644 --- a/app/templates/services/nlp.html.j2 +++ b/app/templates/services/spacy_nlp.html.j2 @@ -2,7 +2,7 @@ {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% import "materialize/wtf.html.j2" as wtf %} -{% block main_attribs %} class="service-scheme" data-service="nlp"{% endblock main_attribs %} +{% block main_attribs %} class="service-scheme" data-service="spacy-nlp"{% endblock main_attribs %} {% block page_content %} <div class="container"> @@ -16,13 +16,13 @@ <p class="hide-on-small-only"> </p> <p class="hide-on-small-only"> </p> <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> - <i class="nopaque-icons service-color darken service-icon" data-service="nlp"></i> + <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp"></i> </a> </div> </div> <div class="col s12 m9 pull-m3"> - <div class="card service-color-border border-darken" data-service="nlp" style="border-top: 10px solid;"> + <div class="card service-color-border border-darken" data-service="spacy-nlp" style="border-top: 10px solid;"> <div class="card-content"> <div class="row"> <div class="col s12 m6"> @@ -71,7 +71,7 @@ {{ wtf.render_field(form.files, accept='text/plain', placeholder='Choose your .txt files') }} </div> <div class="col s12 l4"> - {{ wtf.render_field(form.language, material_icon='language') }} + {{ wtf.render_field(form.model, material_icon='language') }} </div> <div class="col s12 l3"> {{ wtf.render_field(form.version, material_icon='apps') }} @@ -80,13 +80,13 @@ <span class="card-title">Preprocessing</span> </div> <div class="col s9"> - <p>{{ form.check_encoding.label.text }}</p> + <p>{{ form.encoding_detection.label.text }}</p> <p class="light">If the input files are not created with the nopaque OCR service or you do not know if your text files are UTF-8 encoded, check this switch. We will try to automatically determine the right encoding for your texts to process them.</p> </div> <div class="col s3 right-align"> <div class="switch"> <label> - {{ form.check_encoding() }} + {{ form.encoding_detection() }} <span class="lever"></span> </label> </div> diff --git a/app/templates/services/ocr.html.j2 b/app/templates/services/tesseract_ocr.html.j2 similarity index 93% rename from app/templates/services/ocr.html.j2 rename to app/templates/services/tesseract_ocr.html.j2 index 9af593b4a51d3e32f6bd48d62a0c508894dd7b0e..6612128105838ffb8dd9ddb7a8e4b248f9a5a121 100644 --- a/app/templates/services/ocr.html.j2 +++ b/app/templates/services/tesseract_ocr.html.j2 @@ -2,7 +2,7 @@ {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% import "materialize/wtf.html.j2" as wtf %} -{% block main_attribs %} class="service-scheme" data-service="ocr"{% endblock main_attribs %} +{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr"{% endblock main_attribs %} {% block page_content %} <div class="container"> @@ -16,13 +16,13 @@ <p class="hide-on-small-only"> </p> <p class="hide-on-small-only"> </p> <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> - <i class="nopaque-icons service-color darken service-icon" data-service="ocr"></i> + <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i> </a> </div> </div> <div class="col s12 m9 pull-m3"> - <div class="card service-color-border border-darken" data-service="ocr" style="border-top: 10px solid;"> + <div class="card service-color-border border-darken" data-service="tesseract-ocr" style="border-top: 10px solid;"> <div class="card-content"> <div class="row"> <div class="col s12"> @@ -50,10 +50,10 @@ {{ wtf.render_field(form.description, data_length='255', material_icon='description') }} </div> <div class="col s12 l5"> - {{ wtf.render_field(form.files, accept='application/pdf', color=ocr_color_darken, placeholder='Choose your .pdf files') }} + {{ wtf.render_field(form.files, accept='application/pdf', placeholder='Choose your .pdf files') }} </div> <div class="col s12 l4"> - {{ wtf.render_field(form.language, material_icon='language') }} + {{ wtf.render_field(form.model, material_icon='language') }} </div> <div class="col s12 l3"> {{ wtf.render_field(form.version, material_icon='apps') }} @@ -127,7 +127,7 @@ </div> </div> <div class="card-action right-align"> - {{ wtf.render_field(form.submit, color=ocr_color_darken, material_icon='send') }} + {{ wtf.render_field(form.submit, material_icon='send') }} </div> </form> </div> diff --git a/app/utils.py b/app/utils.py deleted file mode 100644 index 75d38b7c8971bafbf58f41675ef412a63a7886b3..0000000000000000000000000000000000000000 --- a/app/utils.py +++ /dev/null @@ -1,10 +0,0 @@ -from app import hashids -from werkzeug.routing import BaseConverter - - -class HashidConverter(BaseConverter): - def to_python(self, value: str) -> int: - return hashids.decode(value)[0] - - def to_url(self, value: int) -> str: - return hashids.encode(value) diff --git a/docker-compose.traefik.yml b/docker-compose.traefik.yml index c261b2d2b41b294e33e440247bb5806167020143..c7d01575bcfc850951953024b8d2031daddc3728 100644 --- a/docker-compose.traefik.yml +++ b/docker-compose.traefik.yml @@ -5,14 +5,14 @@ version: "3.5" networks: - reverse-proxy: - external: - name: reverse-proxy + traefik: + external: true + name: "traefik" services: nopaque: labels: - - "traefik.docker.network=reverse-proxy" + - "traefik.docker.network=traefik" - "traefik.enable=true" ### <http> ### - "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http" diff --git a/migrations/versions/ad0d835fe5b1_.py b/migrations/versions/ad0d835fe5b1_.py new file mode 100644 index 0000000000000000000000000000000000000000..0248e316ccf0ce18b3e1b18bc5c058f2e5dc0c68 --- /dev/null +++ b/migrations/versions/ad0d835fe5b1_.py @@ -0,0 +1,45 @@ +"""empty message + +Revision ID: ad0d835fe5b1 +Revises: 68ed092ffe5e +Create Date: 2022-01-18 16:23:45.673993 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = 'ad0d835fe5b1' +down_revision = '68ed092ffe5e' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('tesseract_ocr_models', + sa.Column('creation_date', sa.DateTime(), nullable=True), + sa.Column('filename', sa.String(length=255), nullable=True), + sa.Column('last_edited_date', sa.DateTime(), nullable=True), + sa.Column('mimetype', sa.String(length=255), nullable=True), + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('user_id', sa.Integer(), nullable=True), + sa.Column('compatible_service_versions', sa.String(length=255), nullable=True), + sa.Column('description', sa.String(length=255), nullable=True), + sa.Column('publisher', sa.String(length=128), nullable=True), + sa.Column('publishing_year', sa.Integer(), nullable=True), + sa.Column('title', sa.String(length=64), nullable=True), + sa.Column('version', sa.String(length=16), nullable=True), + sa.ForeignKeyConstraint(['user_id'], ['users.id'], ), + sa.PrimaryKeyConstraint('id') + ) + op.add_column('job_results', sa.Column('description', sa.String(length=255), nullable=True)) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('job_results', 'description') + op.drop_table('tesseract_ocr_models') + # ### end Alembic commands ### diff --git a/nopaque.py b/nopaque.py index 0045d02b28bf768d064020749d283a11264061c9..ab8db5a6febb7b2257794b73990911c6529ce926 100644 --- a/nopaque.py +++ b/nopaque.py @@ -3,10 +3,9 @@ import eventlet eventlet.monkey_patch() - -from app import db, cli, create_app # noqa from app.models import (Corpus, CorpusFile, Job, JobInput, JobResult, - Permission, QueryResult, Role, User) # noqa + Permission, QueryResult, Role, TesseractOCRModel, User) # noqa +from app import db, cli, create_app # noqa from flask import Flask # noqa from typing import Any, Dict # noqa @@ -34,5 +33,6 @@ def make_shell_context() -> Dict[str, Any]: 'Permission': Permission, 'QueryResult': QueryResult, 'Role': Role, + 'TesseractOCRModel': TesseractOCRModel, 'User': User } diff --git a/requirements.txt b/requirements.txt index 202121fd3e8382ef240fed4ccaf78180895e3f9e..52770c57c788a37ebfd80c9547504de7328a2ff1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,5 +19,7 @@ hiredis jsonschema psycopg2 python-dotenv +pyyaml redis +tqdm wtforms[email]