Commit 7f996d34 authored by Long Nguyen

Move cleaned code to public repo

.Rproj.user
.Rhistory
.Rdata
.httr-oauth
.DS_Store
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dictionaries = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
LibPQ = "194296ae-ab2e-5f79-8cd4-7183a0a5a0d1"
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
ThreadsX = "ac1d9e8a-700a-412c-b207-f0111f4b6c0d"
WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"
# Geocoding German Twitter
Code to reproduce the geocoding and analysis in Nguyen et al. (2022), *Efficient and reliable geocoding of German Twitter data to enable spatial data linkage to official statistics and other data sources*.
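
The two Nominatim instances used for geocoding are defined in the compose file below. As a minimal sketch (not part of the paper's pipeline), the Germany instance can be queried from R once the containers are up; port 9080 is taken from the compose file, and `/search` with `format=jsonv2` is the standard Nominatim API:

```r
library(httr)

# Geocode a free-text query against the local Germany Nominatim instance
resp <- GET(
  "http://localhost:9080/search",
  query = list(q = "Unter den Linden 1, Berlin", format = "jsonv2", limit = 1)
)
str(content(resp, as = "parsed"))
```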
---
version: "3"

services:
  nominatim:
    container_name: nominatim
    image: mediagis/nominatim:4.0
    restart: always
    ports:
      - "8080:8080"
    environment:
      # see https://github.com/mediagis/nominatim-docker/tree/master/4.0#configuration for more options
      # PBF_URL: https://planet.openstreetmap.org/pbf/planet-latest.osm.pbf
      PBF_PATH: /nominatim/data.osm.pbf
      # https://wiki.openstreetmap.org/wiki/Planet.osm/diffs
      REPLICATION_URL: https://planet.openstreetmap.org/replication/day/
      IMPORT_WIKIPEDIA: 'true'
      IMPORT_US_POSTCODES: 'true'
      IMPORT_GB_POSTCODES: 'true'
      THREADS: 64
      NOMINATIM_PASSWORD: "${NOMINATIM_PLANET_PASSWORD}"
    volumes:
      - /mnt/nvmedirect/nominatim-world/postgresql:/var/lib/postgresql/12/main
      - /mnt/nvmedirect/nominatim-world/data:/nominatim/
    shm_size: 16gb
  nominatim-de:
    container_name: nominatim-de
    image: mediagis/nominatim:4.0
    restart: always
    ports:
      - "9080:8080"
    environment:
      PBF_URL: https://download.geofabrik.de/europe/germany-latest.osm.pbf
      REPLICATION_URL: https://download.geofabrik.de/europe/germany-updates/
      IMPORT_WIKIPEDIA: 'true'
      IMPORT_STYLE: 'extratags'
      THREADS: 64
      NOMINATIM_PASSWORD: "${NOMINATIM_DE_PASSWORD}"
    volumes:
      - /mnt/nvmedirect/nominatim-de/postgresql:/var/lib/postgresql/12/main
      - /mnt/nvmedirect/nominatim-de/data:/nominatim/
    shm_size: 16gb
library(dplyr)
library(DBI)
twitter_db <- dbConnect(
  RPostgres::Postgres(),
  dbname = "twitter",
  host = Sys.getenv("TWITTER_DB_HOST"),
  port = Sys.getenv("TWITTER_DB_PORT"),
  user = Sys.getenv("TWITTER_DB_USER")
)
dbExecute(twitter_db, "
  CREATE MATERIALIZED VIEW tweets_geo_3y_nrt_nv_new AS
  SELECT t.tweet_id,
         p.user_id,
         t.text,
         t.post_date,
         t.coordinates_latitude AS lat,
         t.coordinates_longitude AS long,
         g.nuts_1 AS tw_nuts_1,
         g.nuts_2 AS tw_nuts_2,
         g.nuts_3 AS tw_nuts_3,
         p.location,
         l.nuts_1 AS bi_nuts_1,
         l.nuts_2 AS bi_nuts_2,
         l.nuts_3 AS bi_nuts_3
  FROM tweets t
  LEFT JOIN (
    SELECT tweet_id, nuts_1, nuts_2, nuts_3
    FROM profiles_geocoded
  ) l ON t.tweet_id = l.tweet_id
  LEFT JOIN (
    SELECT tweet_id, nuts_1, nuts_2, nuts_3
    FROM tweets_geotagged_nuts
  ) g ON t.tweet_id = g.tweet_id
  INNER JOIN profiles p ON t.tweet_id = p.tweet_id
  WHERE t.post_date >= '2018-10-15'::date AND t.post_date <= '2021-10-14'::date
    AND t.isretweet = FALSE
    AND p.verified = FALSE;
")
dbExecute(twitter_db, "
  CREATE UNIQUE INDEX idx_tweets_geo_3y_nrt_nv_tweetid
  ON tweets_geo_3y_nrt_nv_new (tweet_id);
")
dbExecute(twitter_db, "
  CREATE INDEX idx_tweets_geo_3y_nrt_nv_userid
  ON tweets_geo_3y_nrt_nv_new (user_id);
")
dbExecute(twitter_db, "
  CREATE INDEX idx_tweets_geo_3y_nrt_nv_twnutsid
  ON tweets_geo_3y_nrt_nv_new (tw_nuts_3, long, lat);
")
dbExecute(twitter_db, "
  CREATE INDEX idx_tweets_geo_3y_nrt_nv_binutsid
  ON tweets_geo_3y_nrt_nv_new (bi_nuts_1, bi_nuts_2, bi_nuts_3);
")
# One row per user: the NUTS-3 region containing most of their geotags
dbExecute(twitter_db, "
  CREATE MATERIALIZED VIEW users_twgeo_3y_nrt_nv AS
  SELECT user_id, tw_nuts_3
  FROM (
    SELECT *,
           row_number() OVER (PARTITION BY user_id ORDER BY n DESC) AS r
    FROM (
      SELECT user_id, tw_nuts_3, count(*) AS n
      FROM tweets_geo_3y_nrt_nv_new
      WHERE tw_nuts_3 IS NOT NULL
      GROUP BY user_id, tw_nuts_3
    ) q01
  ) q02
  WHERE (r <= 1);
")
# One row per user: the NUTS region most frequently geocoded from their profile location
dbExecute(twitter_db, "
  CREATE MATERIALIZED VIEW users_bigeo_3y_nrt_nv AS
  SELECT user_id, bi_nuts_1, bi_nuts_2, bi_nuts_3
  FROM (
    SELECT *,
           row_number() OVER (PARTITION BY user_id ORDER BY n DESC) AS r
    FROM (
      SELECT user_id, bi_nuts_1, bi_nuts_2, bi_nuts_3, count(*) AS n
      FROM tweets_geo_3y_nrt_nv_new
      WHERE bi_nuts_1 IS NOT NULL
      GROUP BY user_id, bi_nuts_1, bi_nuts_2, bi_nuts_3
    ) q01
  ) q02
  WHERE (r <= 1);
")
dbDisconnect(twitter_db)
library(purrr)
library(dplyr)
library(DBI)
library(sf)
# Utils -------------------------------------------------------------------
avg_geotag <- function(tbl_sql) {
  tbl_sql |>
    left_join(
      tbl(twitter_db, "tweets_geo_3y_nrt_nv_new") |>
        filter(!is.na(tw_nuts_3)) |>
        distinct(user_id, tw_nuts_3, long, lat),
      by = c("user_id", "tw_nuts_3")
    ) |>
    group_by(user_id, tw_nuts_3) |>
    summarise(across(c(long, lat), mean), .groups = "drop")
}

add_nuts1_nuts2 <- function(df) {
  df |>
    mutate(
      tw_nuts_1 = substr(tw_nuts_3, 1, 3),
      tw_nuts_2 = substr(tw_nuts_3, 1, 4),
      .before = tw_nuts_3
    )
}
notacc_expr <- quote(bi_nuts_3 != tw_nuts_3 | bi_nuts_2 != tw_nuts_2 | bi_nuts_1 != tw_nuts_1)
acc_all <- function(df) {
  df |>
    summarise(
      n = n(),
      n_miss = sum(eval(notacc_expr), na.rm = TRUE)
    ) |>
    transmute(acc = 1 - (n_miss / n)) |>
    pull()
}

acc_level <- function(df, level) {
  vars <- paste0(c("bi_nuts_", "tw_nuts_"), level)
  df |>
    summarise(
      n = n(),
      n_miss = sum(.data[[vars[1]]] != .data[[vars[2]]], na.rm = TRUE)
    ) |>
    transmute(acc = 1 - (n_miss / n)) |>
    pull()
}
st_sfc_4326 <- partial(st_sfc, crs = 4326)
st_transform_25832 <- partial(st_transform, crs = 25832)
coords_to_points <- function(df) {
  df |>
    mutate(geotag = map2(long, lat, lift_vd(st_point))) |>
    pull(geotag) |>
    st_sfc_4326() |>
    st_transform_25832()
}

nuts_to_polygons <- function(df) {
  df |>
    transmute(nuts = coalesce(bi_nuts_3, bi_nuts_2, bi_nuts_1)) |>
    left_join(de_nuts, by = c("nuts" = "NUTS_ID")) |>
    st_as_sf() |>
    st_geometry() |>
    st_transform_25832()
}

dist_geotag_nuts <- function(df) {
  geotags <- coords_to_points(df)
  nuts <- nuts_to_polygons(df)
  st_distance(geotags, nuts, by_element = TRUE, which = "Euclidean")
}
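
# Note: st_distance() between a point and a polygon is 0 when the point lies
# inside the polygon, so users whose averaged geotag falls inside their
# profile-geocoded NUTS region contribute an error distance of 0 m.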
# Data --------------------------------------------------------------------
twitter_db <- dbConnect(
  RPostgres::Postgres(),
  dbname = "twitter",
  host = Sys.getenv("TWITTER_DB_HOST"),
  port = Sys.getenv("TWITTER_DB_PORT"),
  user = Sys.getenv("TWITTER_DB_USER")
)
# N = 26959
users_geo_both <- tbl(twitter_db, "users_twgeo_3y_nrt_nv") |>
  avg_geotag() |>
  inner_join(tbl(twitter_db, "users_bigeo_3y_nrt_nv"), by = "user_id") |>
  collect() |>
  add_nuts1_nuts2()
# Users with at least 2 geotags per NUTS-3 region. If a user has geotags in
# several NUTS-3 regions, take the one into which more than half of their
# geotags fall.
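# Illustration (hypothetical user): 3 geotags in DE300 and 2 in DE212 assigns
# DE300 (3/5 > 0.5); a 2-2 split across two NUTS-3 regions drops the user,
# since neither region contains more than half of the geotags.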
# N = 21228
dbExecute(twitter_db, "
  CREATE MATERIALIZED VIEW users_twgeo_3y_nrt_nv2 AS
  SELECT user_id, tw_nuts_3
  FROM (
    SELECT *, sum(n) OVER (PARTITION BY user_id) AS nn
    FROM (
      SELECT user_id, tw_nuts_3, count(*) AS n
      FROM tweets_geo_3y_nrt_nv_new
      WHERE tw_nuts_3 IS NOT NULL
      GROUP BY user_id, tw_nuts_3
    ) q01
  ) q02
  -- n and nn are bigint, so cast before dividing: plain n / nn is integer
  -- division and would never exceed 0.5 unless n = nn
  WHERE (n > 1 AND n::numeric / nn > 0.5);
")
# N = 13423
users_geo_both2 <- tbl(twitter_db, "users_twgeo_3y_nrt_nv2") |>
  avg_geotag() |>
  inner_join(tbl(twitter_db, "users_bigeo_3y_nrt_nv"), by = "user_id") |>
  collect() |>
  add_nuts1_nuts2()
de_nuts <- giscoR::gisco_get_nuts(country = "Germany")
# Accuracy ----------------------------------------------------------------
acc_all(users_geo_both)
#> [1] 0.7256
acc_all(users_geo_both2)
#> [1] 0.857
map(set_names(1:3), acc_level, df = users_geo_both)
#> $`1`
#> [1] 0.8473
#>
#> $`2`
#> [1] 0.8143
#>
#> $`3`
#> [1] 0.736
#>
map(set_names(1:3), acc_level, df = users_geo_both2)
#> $`1`
#> [1] 0.9274
#>
#> $`2`
#> [1] 0.9092
#>
#> $`3`
#> [1] 0.8607
#>
# Distances ---------------------------------------------------------------
users_notacc <- users_geo_both |>
  filter(eval(notacc_expr))
users_dists <- dist_geotag_nuts(users_notacc) |>
  c(rep(units::set_units(0, m), nrow(users_geo_both) - nrow(users_notacc)))
# Acc@161: share of users whose averaged geotag lies within 161 km (100 miles)
# of their profile-geocoded NUTS region (accurately located users count as 0 m)
mean(users_dists < units::set_units(161, km))
#> [1] 0.9039
median(users_dists)
#> 0 [m]
mean(users_dists)
#> 40731 [m]
#>
#> \begin{tabular}[t]{lrrrrr}
#> \toprule
#> \multicolumn{4}{c}{ } & \multicolumn{2}{c}{Error Distance (km)} \\
#> \cmidrule(l{3pt}r{3pt}){5-6}
#> NUTS level & N users & Accuracy (\%) & Accuracy@161 (\%) & Median & Mean\\
#> \midrule
#> NUTS-1 & 26959 & 84.73 & NA & NA & NA\\
#> NUTS-2 & 25635 & 81.43 & NA & NA & NA\\
#> NUTS-3 & 25305 & 73.60 & NA & NA & NA\\
#> All levels & 26959 & 72.56 & 90.39 & 0 & 40.73\\
#> \bottomrule
#> \end{tabular}
#>
## Gold standard ----
users_notacc2 <- users_geo_both2 |>
  filter(eval(notacc_expr))
users_dists2 <- dist_geotag_nuts(users_notacc2) |>
  c(rep(units::set_units(0, m), nrow(users_geo_both2) - nrow(users_notacc2)))
# Acc@161
mean(users_dists2 < units::set_units(161, km))
#> [1] 0.9587
median(users_dists2)
#> 0 [m]
mean(users_dists2)
#> 18348 [m]
#>
#> \begin{tabular}[t]{lrrrrr}
#> \toprule
#> \multicolumn{4}{c}{ } & \multicolumn{2}{c}{Error Distance (km)} \\
#> \cmidrule(l{3pt}r{3pt}){5-6}
#> NUTS level & N users & Accuracy (\%) & Accuracy@161 (\%) & Median & Mean\\
#> \midrule
#> NUTS-1 & 13423 & 92.74 & NA & NA & NA\\
#> NUTS-2 & 12919 & 90.92 & NA & NA & NA\\
#> NUTS-3 & 12793 & 86.07 & NA & NA & NA\\
#> All levels & 13423 & 85.70 & 95.87 & 0 & 18.35\\
#> \bottomrule
#> \end{tabular}
#>
library(dplyr)
library(DBI)
twitter_db <- dbConnect(
  RPostgres::Postgres(),
  dbname = "twitter",
  host = Sys.getenv("TWITTER_DB_HOST"),
  port = Sys.getenv("TWITTER_DB_PORT"),
  user = Sys.getenv("TWITTER_DB_USER")
)
dbExecute(twitter_db, "
  CREATE MATERIALIZED VIEW tweets_per_nuts_3y_nrt_nv AS
  SELECT geo_source, nuts_1, nuts_2, nuts_3, count(*) AS n_tweets
  FROM tweets_geo_3y_nrt_nv
  WHERE geo_source IS NOT NULL
  GROUP BY geo_source, nuts_1, nuts_2, nuts_3;
")
dbExecute(twitter_db, "
  CREATE MATERIALIZED VIEW users_per_nuts_3y_nrt_nv AS
  SELECT geo_source, nuts_1, nuts_2, nuts_3, count(*) AS n_users
  FROM users_geo_3y_nrt_nv
  GROUP BY geo_source, nuts_1, nuts_2, nuts_3;
")
dbDisconnect(twitter_db)
library(tidyverse)
library(rvest)
library(DBI)
twitter_db <- dbConnect(
  RPostgres::Postgres(),
  dbname = "twitter",
  host = Sys.getenv("TWITTER_DB_HOST"),
  port = Sys.getenv("TWITTER_DB_PORT"),
  user = Sys.getenv("TWITTER_DB_USER")
)
destatis_home <- "https://www.destatis.de"
# Territorial status (Gebietsstand): 2020-12-31
dl_site <- paste0(
  destatis_home,
  "/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/04-kreise.html"
) |>
  read_html()
# base R's |> has no placeholder, so wrap paste0() in an anonymous function
dl_link <- dl_site |>
  html_element(".downloadLink") |>
  html_attr("href") |>
  (\(href) paste0(destatis_home, href))()
xlsx_name <- dl_link |>
  str_extract("[^/]+\\.xlsx")
download.file(dl_link, xlsx_name)
nuts3_pop <- readxl::read_xlsx(
  xlsx_name, sheet = 2, skip = 7,
  col_names = c(
    "destatis_id",
    "type",
    "name",
    "nuts_3",
    "area",
    "pop",
    "pop_male",
    "pop_female",
    "pop_density"
  )
) |>
  drop_na(nuts_3) |>
  mutate(across(pop:pop_female, bit64::as.integer64))
file.remove(xlsx_name)
dbWriteTable(twitter_db, "nuts3_pop", nuts3_pop)
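# Sketch (not part of the original script): with nuts3_pop in the database,
# per-capita Twitter coverage can be computed by joining it to the per-NUTS
# user counts created earlier, e.g.
# tbl(twitter_db, "users_per_nuts_3y_nrt_nv") |>
#   inner_join(tbl(twitter_db, "nuts3_pop"), by = "nuts_3") |>
#   mutate(users_per_100k = n_users / pop * 1e5)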
dbDisconnect(twitter_db)
#module GeocodedContent
using DotEnv
DotEnv.config();
using LibPQ
using Tables
using WordTokenizers: TokenBuffer, flush!, pre_process, isdone, spaces, character
using Dictionaries
include("evaluation/04-tweet-tokens/tokeniser.jl")
include("evaluation/04-tweet-tokens/utils.jl")
using DataFrames
using DataFramesMeta
#end
using Pkg
Pkg.activate(".")
include("evaluation/04-tweet-tokens/GeocodedContent.jl")
conn = LibPQ.Connection(ENV["CONN_STRING"])
table = "vocab_geo_notverified"
cursor_name = "TextCurs"
# DROP TABLE IF EXISTS $table;
execute(conn,"
CREATE TABLE $table(
token VARCHAR,
frequency Integer
);
")
LibPQ.execute(conn, "BEGIN;")
# query ="
# DECLARE $cursor_name SCROLL CURSOR FOR
# SELECT REGEXP_REPLACE(Text::varchar ,'(https*):\\/\\/([[:alnum:]|\\.|\\-|\\/|\\?|=])+',' ','gi') AS Text
# FROM Tweets
# WHERE NOT isretweet
# AND coordinates_longitude IS NOT NULL
# AND post_date BETWEEN '2018-10-15' AND '2021-10-14';
# "
query ="
DECLARE $cursor_name SCROLL CURSOR FOR
SELECT REGEXP_REPLACE(Text::varchar ,'(https*):\\/\\/([[:alnum:]|\\.|\\-|\\/|\\?|=])+',' ','gi') AS Text
FROM tweets_geo_3y_nrt_nv
WHERE geo_source IS NOT NULL;
"
# AND coordinates_longitude IS NOT NULL
qp = LibPQ.execute(conn,"
EXPLAIN
$query
") |> columntable
# query_expected_rows = 884065424 rows of full with no retweets
# Pull the planner's estimated row count out of the EXPLAIN output
query_expected_rows = parse(Int, match(r"rows=(\d+)", qp[1][1]).captures[1])
LibPQ.execute(conn, query)
full = execute(conn, "SELECT * FROM vocab_full_notverified") |>
    DataFrame |>
    disallowmissing! |>
    x -> sort!(x, :frequency, rev = true)
vocab = Dictionary{String,Int32}()
# wordlist = Indices{String}()
wordlist = Indices(full.token)
function main()
    generate_vocabulary(conn, vocab, cursor_name, 2000, 25, query_expected_rows, true, wordlist)
end
main()
LibPQ.load!(
    vocab |> pairs |> rowtable,
    conn,
    "INSERT INTO $table (token, frequency) VALUES (\$1, \$2);",
)
LibPQ.execute(conn, "COMMIT;")
close(conn)
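
# Sketch (not part of the original script): after reopening the connection,
# inspect the most frequent tokens in the table just written, e.g.
# conn = LibPQ.Connection(ENV["CONN_STRING"])
# execute(conn, "SELECT token, frequency FROM vocab_geo_notverified
#                ORDER BY frequency DESC LIMIT 20") |> DataFrame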
"""
tokenise(input::String; transform_to_lower_case = true)
Turns a sentence into tokens. Split happens on spaces, afterwards non-unicode characters are removed. Does not deal with URLs as these are removed by PostgresSQL.
"""
function tokenise(input::String; transform_to_lower_case = true)::Vector{String}
if transform_to_lower_case == true
input = lowercase(input)
end
input = pre_process(input, true, true) # input, strip_handle, reduce_len
tb = TokenBuffer(input)
while !isdone(tb)
spaces(tb) && continue
_nonunicodecharacters(tb) ||
character(tb)
end
tb.tokens[tb.tokens .|> length .!= 1]
return tb.tokens
end
"Removes all non-unicode characters from a token."
function _nonunicodecharacters(tb::TokenBuffer)
Base.Unicode.category_code(tb[]) != 2 && tb[tb.idx] != '#' || return false
#cc != 2 && tb[tb.idx] != '#' || return false
flush!(tb)
tb.idx += 1
return true
end
"""
parse_tweets!(tweets::Vector{String}, vocab::Dictionary{String,Int32})
Tokenises a vector of sentences (e.g. tweets) and updates an already initialized Dictionary with the respective counts for each extracted token.