diff --git a/app/corpora/json_routes.py b/app/corpora/json_routes.py index b6ef0110f41afcfe9558c3122fc50b79240e7454..6784bb9ee3e968b36c1f3e4cb72c84fcc8bfb216 100644 --- a/app/corpora/json_routes.py +++ b/app/corpora/json_routes.py @@ -61,12 +61,6 @@ def build_corpus(corpus_id): @bp.route('/stopwords') @content_negotiation(produces='application/json') def get_stopwords(): - # data = request.json - # if not isinstance(data, dict): - # abort(400) - # language = data.get('language') - # if not isinstance(language, str): - # abort(400) nltk.download('stopwords') languages = ["german", "english", "catalan", "greek", "spanish", "french", "italian", "russian", "chinese"] stopwords = {} @@ -74,10 +68,7 @@ def get_stopwords(): stopwords[language] = nltk.corpus.stopwords.words(language) stopwords['punctuation'] = list(punctuation) + ['—', '|'] stopwords['user_stopwords'] = [] - print(stopwords) - response_data = { - 'stopwords': stopwords - } + response_data = stopwords return response_data, 202 # @bp.route('/<hashid:corpus_id>/generate-share-link', methods=['POST']) diff --git a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js index 8af82e8d32213d4a4b44cc17f5163ed4e08ec0f1..a30af989d6bbfd7ff716f922d8dee84dc6dfad81 100644 --- a/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js +++ b/app/static/js/CorpusAnalysis/CorpusAnalysisApp.js @@ -1,7 +1,10 @@ class CorpusAnalysisApp { constructor(corpusId) { this.data = { - promises: {getStopwords: []} + stopwords: undefined, + originalStopwords: {}, + stopwordCache: {}, + promises: {getStopwords: undefined} }; // HTML elements @@ -24,39 +27,17 @@ class CorpusAnalysisApp { }; } - // getStopwords(language) { - // if (language in this.data.promises.getStopwords) { - // console.log('Stopwords already loaded'); - // return this.data.promises.getStopwords[language]; - // } - // this.data.promises.getStopwords[language] = new Promise((resolve, reject) => { - // Requests.corpora.entity.getStopwords(language) - // .then((response) => { - // response.json() - // .then((json) => { - // let stopwords = json.stopwords; - // resolve(stopwords); - // }) - // .catch((error) => { - // reject(error); - // }); - // }); - // }); - // return this.data.promises.getStopwords[language]; - // } - getStopwords() { - if (this.data.promises.getStopwords.length !== 0) { - console.log('Stopwords already loaded'); - return this.data.promises.getStopwords; - } this.data.promises.getStopwords = new Promise((resolve, reject) => { Requests.corpora.entity.getStopwords() .then((response) => { response.json() .then((json) => { - let stopwords = json.stopwords; - resolve(stopwords); + for (let [key, value] of Object.entries(json)) { + this.data.originalStopwords[key] = value; + } + this.data.stopwords = json; + resolve(this.data.stopwords); }) .catch((error) => { reject(error); @@ -66,7 +47,6 @@ class CorpusAnalysisApp { return this.data.promises.getStopwords; } - init() { this.disableActionElements(); this.elements.m.initModal.open(); @@ -79,20 +59,11 @@ class CorpusAnalysisApp { .then((cqiCorpora) => { this.data.corpus = {o: cqiCorpora[0]}; console.log(this.data.corpus.o.staticData); - this.renderGeneralCorpusInfo(this.data.corpus.o.staticData); - this.renderTextInfoList(this.data.corpus.o.staticData); - this.renderTextProportionsGraphic(this.data.corpus.o.staticData); - this.renderFrequenciesGraphic(this.data.corpus.o.staticData); - this.renderBoundsGraphic(this.data.corpus.o.staticData); - // this.data.corpus.o.getCorpusData() - // .then(corpusData => { - // console.log(corpusData); - // this.renderGeneralCorpusInfo(corpusData); - // this.renderTextInfoList(corpusData); - // this.renderTextProportionsGraphic(corpusData); - // this.renderFrequenciesGraphic(corpusData); - // this.renderBoundsGraphic(corpusData); - // }); + this.renderGeneralCorpusInfo(); + this.renderTextInfoList(); + this.renderTextProportionsGraphic() + this.renderFrequenciesGraphic(); + this.renderBoundsGraphic(); // TODO: Don't do this hgere this.data.corpus.o.updateDb(); this.enableActionElements(); @@ -117,6 +88,30 @@ class CorpusAnalysisApp { this.elements.m.extensionTabs.select(extensionSelectorElement.dataset.target); }); } + + let frequenciesStopwordSettingModal = document.querySelector('#frequencies-stopwords-setting-modal'); + let frequenciesStopwordSettingModalButton = document.querySelector('#frequencies-stopwords-setting-modal-button'); + frequenciesStopwordSettingModalButton.addEventListener('click', () => { + this.data.stopwordCache = {}; + const stopwordsCopy = Object.assign({}, this.data.stopwords); + for (let [key, value] of Object.entries(stopwordsCopy)) { + this.data.stopwordCache[key] = value; + } + this.renderStopwordSettingsModal(this.data.stopwords); + M.Modal.init(frequenciesStopwordSettingModal, {dismissible: false}); + }); + + for (let actionButton of document.querySelectorAll('.frequencies-stopword-setting-modal-action-buttons')) { + actionButton.addEventListener('click', (event) => { + let action = event.target.closest('.frequencies-stopword-setting-modal-action-buttons').dataset.action; + if (action === 'submit') { + this.renderFrequenciesGraphic(); + } else if (action === 'cancel') { + this.data.stopwords = this.data.stopwordCache; + this.renderFrequenciesGraphic(); + } + }); + } } registerExtension(extension) { @@ -154,7 +149,8 @@ class CorpusAnalysisApp { } } - renderGeneralCorpusInfo(corpusData) { + renderGeneralCorpusInfo() { + let corpusData = this.data.corpus.o.staticData; document.querySelector('.corpus-num-tokens').innerHTML = corpusData.corpus.counts.token; document.querySelector('.corpus-num-s').innerHTML = corpusData.corpus.counts.s; document.querySelector('.corpus-num-unique-words').innerHTML = Object.entries(corpusData.corpus.freqs.word).length; @@ -163,7 +159,8 @@ class CorpusAnalysisApp { document.querySelector('.corpus-num-unique-simple-pos').innerHTML = Object.entries(corpusData.corpus.freqs.simple_pos).length; } - renderTextInfoList(corpusData) { + renderTextInfoList() { + let corpusData = this.data.corpus.o.staticData; let corpusTextInfoListElement = document.querySelector('.corpus-text-info-list'); let corpusTextInfoList = new CorpusTextInfoList(corpusTextInfoListElement); let texts = corpusData.s_attrs.text.lexicon; @@ -189,7 +186,8 @@ class CorpusAnalysisApp { textCountChipElement.innerHTML = `Text count: ${corpusData.corpus.counts.text}`; } - renderTextProportionsGraphic(corpusData) { + renderTextProportionsGraphic() { + let corpusData = this.data.corpus.o.staticData; let textProportionsGraphicElement = document.querySelector('#text-proportions-graphic'); let texts = Object.entries(corpusData.s_attrs.text.lexicon); let graphData = [ @@ -223,7 +221,8 @@ class CorpusAnalysisApp { Plotly.newPlot(textProportionsGraphicElement, graphData, graphLayout, config); } - renderFrequenciesGraphic(corpusData) { + async renderFrequenciesGraphic() { + let corpusData = this.data.corpus.o.staticData; let frequenciesTokenCategoryDropdownElement = document.querySelector('[data-target="frequencies-token-category-dropdown"]'); let frequenciesTokenCategoryDropdownListElement = document.querySelector("#frequencies-token-category-dropdown"); let frequenciesGraphicElement = document.querySelector('#frequencies-graphic'); @@ -248,123 +247,220 @@ class CorpusAnalysisApp { let tokenCategory = frequenciesTokenCategoryDropdownElement.firstChild.textContent.toLowerCase(); - this.createFrequenciesGraphData(tokenCategory, texts, corpusData, graphtype) - .then(graphData => { - let graphLayout = { - barmode: graphtype === 'bar' ? 'stack' : '', - margin: { - t: 20, - l: 50 - }, - yaxis: { - showticklabels: graphtype === 'markers' ? false : true - }, + let graphData = await this.createFrequenciesGraphData(tokenCategory, texts, corpusData, graphtype); + let graphLayout = { + barmode: graphtype === 'bar' ? 'stack' : '', + margin: { + t: 20, + l: 50 + }, + yaxis: { + showticklabels: graphtype === 'markers' ? false : true + }, + }; + let config = { + responsive: true, + modeBarButtonsToRemove: ['zoom2d', 'select2d', 'lasso2d', 'zoomIn2d', 'zoomOut2d', 'autoScale2d', 'resetScale2d'], + displaylogo: false + }; + Plotly.newPlot(frequenciesGraphicElement, graphData, graphLayout, config); + } + + async createFrequenciesGraphData(category, texts, corpusData, graphtype) { + let stopwords = this.data.stopwords; + if (this.data.stopwords === undefined) { + stopwords = await this.getStopwords(); + } + let stopwordList = []; + Object.values(stopwords).forEach(stopwordItems => { + stopwordItems.forEach(stopword => { + stopwordList.push(stopword); + }); + }); + + let graphData = []; + let filteredData = Object.entries(corpusData.corpus.freqs[category]) + .sort((a, b) => b[1] - a[1]) + .filter(item => !stopwordList.includes(corpusData.values.p_attrs[category][item[0]].toLowerCase())) + .slice(0, 5); + + if (graphtype !== 'markers') { + for (let item of filteredData) { + let data = { + x: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`), + y: texts.map(text => text[1].freqs[category][item[0]] || 0), + name: corpusData.values.p_attrs[category][item[0]], + type: graphtype }; - let config = { - responsive: true, - modeBarButtonsToRemove: ['zoom2d', 'select2d', 'lasso2d', 'zoomIn2d', 'zoomOut2d', 'autoScale2d', 'resetScale2d'], - displaylogo: false + graphData.push(data); + } + } else { + for (let item of filteredData) { + let size = texts.map(text => text[1].freqs[category][item[0]] || 0); + let data = { + x: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`), + y: texts.map(text => corpusData.values.p_attrs[category][item[0]]), + name: corpusData.values.p_attrs[category][item[0]], + text: texts.map(text => `${corpusData.values.p_attrs[category][item[0]]}<br>${text[1].freqs[category][item[0]] || 0}`), + mode: 'markers', + marker: { + size: size, + sizeref: 0.4 + } }; - Plotly.newPlot(frequenciesGraphicElement, graphData, graphLayout, config); - }); + graphData.push(data); + } + } + return graphData; } - createFrequenciesGraphData(category, texts, corpusData, graphtype) { - return new Promise((resolve, reject) => { - this.getStopwords() - .then(stopwords => { - this.renderStopwordSettingsModal(stopwords); - let stopwordList = []; - Object.values(stopwords).forEach(stopwordItems => { - stopwordItems.forEach(stopword => { - stopwordList.push(stopword); - }); - }); - let graphData = []; - let filteredData = Object.entries(corpusData.corpus.freqs[category]) - .sort((a, b) => b[1] - a[1]) - .filter(item => !stopwordList.includes(corpusData.values.p_attrs[category][item[0]].toLowerCase())) - .slice(0, 5); - if (graphtype !== 'markers') { - for (let item of filteredData) { - let data = { - x: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`), - y: texts.map(text => text[1].freqs[category][item[0]] || 0), - name: corpusData.values.p_attrs[category][item[0]], - type: graphtype - }; - graphData.push(data); - } - } else { - for (let item of filteredData) { - let size = texts.map(text => text[1].freqs[category][item[0]] || 0); - let data = { - x: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`), - y: texts.map(text => corpusData.values.p_attrs[category][item[0]]), - name: corpusData.values.p_attrs[category][item[0]], - text: texts.map(text => `${corpusData.values.p_attrs[category][item[0]]}<br>${text[1].freqs[category][item[0]] || 0}`), - mode: 'markers', - marker: { - size: size, - // sizeref: 2.0 * Math.max(...size) / (80**2), - // sizemode: 'area', - sizeref: 0.2 - } - }; - graphData.push(data); - } - } - resolve(graphData); - }) - .catch(error => { - reject(error); + renderStopwordSettingsModal(stopwords) { + let stopwordInputField = document.querySelector('#stopword-input-field'); + let userStopwordListContainer = document.querySelector('#user-stopword-list-container'); + let stopwordLanguageSelection = document.querySelector('#stopword-language-selection'); + let stopwordLanguageChipList = document.querySelector('#stopword-language-chip-list'); + let deleteLanguageStopwordListEntriesButton = document.querySelector('#delete-language-stopword-list-entries-button'); + let resetLanguageStopwordListEntriesButton = document.querySelector('#reset-language-stopword-list-entries-button'); + + stopwordLanguageChipList.innerHTML = ''; + userStopwordListContainer.innerHTML = ''; + + // Render stopword language selection. Set english as default language. Filter out user_stopwords. + for (let language of Object.keys(stopwords)) { + if (language !== 'user_stopwords') { + if (language === 'english') { + let optionElement = Utils.HTMLToElement(`<option value="${language}" selected>${language}</option>`); + stopwordLanguageSelection.appendChild(optionElement); + } else { + let optionElement = Utils.HTMLToElement(`<option value="${language}">${language}</option>`); + stopwordLanguageSelection.appendChild(optionElement); + } + } + } + + // Render user stopwords over input field. + if (this.data.stopwords['user_stopwords'].length > 0) { + for (let word of this.data.stopwords['user_stopwords']) { + let chipElement = Utils.HTMLToElement(`<div class="chip">${word}<i class="close material-icons">close</i></div>`); + chipElement.addEventListener('click', (event) => { + let removedListItem = event.target.closest('.chip').firstChild.textContent; + this.data.stopwords['user_stopwords'] = this.data.stopwords['user_stopwords'].filter(item => item !== removedListItem); }); + userStopwordListContainer.appendChild(chipElement); + } + } + + // Render english stopwords as default ... + this.renderStopwordLanguageChipList('english', stopwords['english']); + + // ... or render selected language stopwords. + stopwordLanguageSelection.addEventListener('change', (event) => { + this.renderStopwordLanguageChipList(event.target.value, stopwords[event.target.value]); + }); + + // Eventlistener for deleting all stopwords of a language. + deleteLanguageStopwordListEntriesButton.addEventListener('click', (event) => { + let selectedLanguage = stopwordLanguageSelection.value; + this.data.stopwords[selectedLanguage] = []; + stopwordLanguageChipList.innerHTML = ''; + this.buttonRendering(); + }); + + // Eventlistener for resetting all stopwords of a language to the original stopwords. + resetLanguageStopwordListEntriesButton.addEventListener('click', () => { + let selectedLanguage = stopwordLanguageSelection.value; + this.data.stopwords[selectedLanguage] = this.data.originalStopwords[selectedLanguage]; + this.renderStopwordLanguageChipList(selectedLanguage, this.data.stopwords[selectedLanguage]); }); + + // Initialize Materialize components. + M.Chips.init( + stopwordInputField, + { + placeholder: 'Add stopwords', + onChipAdd: (event) => { + let userStopwords = []; + for (let word of event[0].M_Chips.chipsData) { + if (!this.data.stopwords['user_stopwords'].includes(word.tag.toLowerCase())) { + userStopwords.push(word.tag.toLowerCase()); + } + } + this.data.stopwords['user_stopwords'] = this.data.stopwords['user_stopwords'].concat(userStopwords); + } + } + ); + M.FormSelect.init(stopwordLanguageSelection); + } - renderStopwordSettingsModal(stopwords) { - let stopwordInputField = document.querySelector('.stopword-input-field'); + buttonRendering() { + let stopwordLanguageSelection = document.querySelector('#stopword-language-selection'); + let deleteLanguageStopwordListEntriesButton = document.querySelector('#delete-language-stopword-list-entries-button'); + let resetLanguageStopwordListEntriesButton = document.querySelector('#reset-language-stopword-list-entries-button'); + + let selectedLanguage = stopwordLanguageSelection.value; + let stopwordLength = this.data.stopwords[selectedLanguage].length; + let originalStopwordListLength = this.data.originalStopwords[selectedLanguage].length; + + resetLanguageStopwordListEntriesButton.classList.toggle('blue', stopwordLength !== originalStopwordListLength); + deleteLanguageStopwordListEntriesButton.classList.toggle('red', stopwordLength > 0); + resetLanguageStopwordListEntriesButton.style.cursor = stopwordLength !== originalStopwordListLength ? 'pointer' : 'default'; + deleteLanguageStopwordListEntriesButton.style.cursor = stopwordLength > 0 ? 'pointer' : 'default'; } + renderStopwordLanguageChipList(language, stopwords) { + let stopwordLanguageChipList = document.querySelector('#stopword-language-chip-list'); + stopwordLanguageChipList.innerHTML = ''; + for (let word of stopwords) { + let chipElement = Utils.HTMLToElement(`<div class="chip">${word}<i class="close material-icons">close</i></div>`); + chipElement.addEventListener('click', (event) => { + let removedListItem = event.target.closest('.chip').firstChild.textContent; + this.data.stopwords[language] = this.data.stopwords[language].filter(item => item !== removedListItem); + this.buttonRendering(); + }); + stopwordLanguageChipList.appendChild(chipElement); + } + this.buttonRendering(); + } + renderBoundsGraphic() { + let corpusData = this.data.corpus.o.staticData; + let boundsGraphicElement = document.querySelector('#bounds-graphic'); - renderBoundsGraphic(corpusData) { - let boundsGraphicElement = document.querySelector('#bounds-graphic'); - - let graphData = []; - let texts = Object.entries(corpusData.s_attrs.text.lexicon); - - graphData = [{ - type: 'bar', - x: texts.map(text => text[1].bounds[1] - text[1].bounds[0]), - y: texts.map(text => corpusData.values.s_attrs.text[text[0]].title), - base: texts.map(text => text[1].bounds[0]), - text: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`), - orientation: 'h', - hovertemplate: '%{base} - %{x} <br>%{y}', - showlegend: false - }]; - - let graphLayout = { - barmode: 'stack', - type: 'bar', - showgrid: false, - xaxis: { - rangemode: 'nonnegative', - autorange: true - }, - yaxis: { - autorange: true, - showticklabels: false - } - }; + let graphData = []; + let texts = Object.entries(corpusData.s_attrs.text.lexicon); - let config = { - responsive: true, - modeBarButtonsToRemove: ['zoom2d', 'select2d', 'lasso2d', 'zoomIn2d', 'zoomOut2d', 'autoScale2d', 'resetScale2d'], - displaylogo: false - }; - - Plotly.newPlot(boundsGraphicElement, graphData, graphLayout, config); + graphData = [{ + type: 'bar', + x: texts.map(text => text[1].bounds[1] - text[1].bounds[0]), + y: texts.map(text => corpusData.values.s_attrs.text[text[0]].title), + base: texts.map(text => text[1].bounds[0]), + text: texts.map(text => `${corpusData.values.s_attrs.text[text[0]].title} (${corpusData.values.s_attrs.text[text[0]].publishing_year})`), + orientation: 'h', + hovertemplate: '%{base} - %{x} <br>%{y}', + showlegend: false + }]; + + let graphLayout = { + barmode: 'stack', + type: 'bar', + showgrid: false, + xaxis: { + rangemode: 'nonnegative', + autorange: true + }, + yaxis: { + autorange: true, + showticklabels: false + } + }; + + let config = { + responsive: true, + modeBarButtonsToRemove: ['zoom2d', 'select2d', 'lasso2d', 'zoomIn2d', 'zoomOut2d', 'autoScale2d', 'resetScale2d'], + displaylogo: false + }; + + Plotly.newPlot(boundsGraphicElement, graphData, graphLayout, config); } } diff --git a/app/templates/corpora/analysis.html.j2 b/app/templates/corpora/analysis.html.j2 index b9a80c974ab16e3d821ae2d73cbb560c65e47da4..2bdb35f23c1417ddb2b5a0c4c815777aeab750d5 100644 --- a/app/templates/corpora/analysis.html.j2 +++ b/app/templates/corpora/analysis.html.j2 @@ -123,7 +123,7 @@ <a class="btn disabled frequencies-graph-mode-button" data-graph-type="bar"><i class="material-icons">equalizer</i></a> <a class="btn frequencies-graph-mode-button" data-graph-type="scatter"><i class="material-icons">show_chart</i></a> <a class="btn frequencies-graph-mode-button" data-graph-type="markers"><i class="material-icons">bubble_chart</i></a> - <a class="btn-flat modal-trigger" href="#frequencies-stopwords-setting-modal"><i class="material-icons grey-text text-darken-2">settings</i></a> + <a class="btn-flat modal-trigger no-autoinit" id="frequencies-stopwords-setting-modal-button" href="#frequencies-stopwords-setting-modal"><i class="material-icons grey-text text-darken-2">settings</i></a> </div> </div> </div> @@ -140,7 +140,6 @@ </div> </div> </div> - </div> @@ -166,18 +165,35 @@ </div> </div> -<div class="modal" id="frequencies-stopwords-setting-modal"> +<div class="modal modal-fixed-footer" id="frequencies-stopwords-setting-modal"> <div class="modal-content"> <h4>Settings</h4> - <p>Here you can change the stopword-lists. Add your own stopwords or change the already existing below.</p> - <div class="chips chips-placeholder stopword-input-field"></div> <div class="row"> + <p>Here you can change the stopword-lists. Stopwords are common words in a language, + like "the" or "and," that carry little meaning and are often removed in text analysis + to improve efficiency and accuracy.</p> + <div id="user-stopword-list-container"></div> + <div class="chips col s8 no-autoinit input-field" id="stopword-input-field"> + </div> + </div> + <div class="row"> + <p>Below you can find a list of all stopwords that are always filtered out. + The lists are sorted by language, you can remove single words or remove + whole languages via the settings on the right.</p> <div class="input-field col s3"> - <select class="stopword-language-selection"></select> + <select id="stopword-language-selection"></select> <label>Stopword language select</label> </div> </div> + <div class="row"> + <div class="chip white-text" id="delete-language-stopword-list-entries-button" style="cursor:pointer">Delete all below<i class="material-icons right" style="margin-top: 4px; margin-left: -1px;">delete</i></div> + <div class="chip white-text" id="reset-language-stopword-list-entries-button" style="cursor:pointer">Reset stopword list<i class="material-icons right disable-on-click" style="margin-top: 4px; margin-left: -1px;">refresh</i></div> + </div> + <div id="stopword-language-chip-list"></div> </div> + <div class="modal-footer"> + <a class="modal-close waves-effect waves-green btn frequencies-stopword-setting-modal-action-buttons" data-action="cancel">Cancel</a> + <a class="modal-close waves-effect waves-green btn frequencies-stopword-setting-modal-action-buttons" data-action="submit">Submit</a> </div>