Find duplicate IMDb IDs
# List entries of the current directory that share the same "#tt..." suffix.
# Every suffix that occurs more than once is turned into the glob *<suffix> and
# the matching names are echoed on one line. The suffix is handed to sh as a
# positional argument ($1) rather than spliced into the command string, so it
# is neither word-split nor interpreted as shell code.
find . -maxdepth 1 | grep -o -E '#tt.*' | sort | uniq -d | xargs -I'{}' sh -c 'echo *"$1"' sh '{}'
# virtualenv
# Install the IMDbPY package with pip.
$ pip install IMDbPY
# Run IMDbPY's imdbpy2sql.py against a SQLite database file named imdb.db in
# the current working directory (the URI is built from `pwd`).
# NOTE(review): -d presumably names the directory holding the downloaded IMDb
# list files (here ./imdb) — confirm against the imdbpy2sql.py documentation.
$ imdbpy2sql.py -d imdb -u "sqlite:///`pwd`/imdb.db" --sqlite-transactions
postgres
# Stream movies.list.gz straight into the PostgreSQL table "movies" of the
# database "imdb" on host 10.2.2.13.
# Only lines whose second tab-separated field is a plain number are kept
# (presumably the release year); they are emitted as "<field1>|<field2>" to
# match the COPY delimiter. The table is truncated before the load.
wget -q -O - 'ftp://ftp.sunet.se/pub/tv+movies/imdb/movies.list.gz' \
  | gzip -d -c - \
  | awk -F '\t+' '
      # Fields are separated by runs of one or more tabs. "\t+" cannot match
      # the empty string, so the split is the same in every awk.
      $2 ~ /^[0-9]+$/ { print $1 "|" $2 }
    ' \
  | psql -h 10.2.2.13 -c "
SET CLIENT_ENCODING TO 'LATIN1';
TRUNCATE TABLE movies;
COPY movies FROM STDIN DELIMITER '|'" imdb
sqlite
#!/bin/sh
# Downloaded list files are kept in a "cache" directory located next to this
# script; create that directory if it is not there yet.
cache="$(dirname "$0")/cache"
[ -d "${cache}" ] || mkdir "${cache}"
# get_file NAME
# Write the list file NAME to stdout, downloading it on first use.
# Globals:   cache (read) - directory holding the downloaded files
#            name (written) - kept global, the script targets plain /bin/sh
# Arguments: $1 - file name below the FTP database directory
# Outputs:   the raw file content on stdout
# Returns:   0 on success, 1 when the download fails or yields an empty file
get_file() {
  name="$1"
  if [ ! -f "${cache}/${name}" ]; then
    # Download into a temporary ".part" file and promote it only after a
    # complete, non-empty transfer. A broken download must never be left
    # behind as a cache entry, because it would be served on every later run.
    if wget -O - "ftp://ftp.fu-berlin.de/pub/misc/movies/database/${name}" \
        > "${cache}/${name}.part" \
      && [ -s "${cache}/${name}.part" ]; then
      mv "${cache}/${name}.part" "${cache}/${name}"
    else
      rm -f "${cache}/${name}.part"
      return 1
    fi
  fi
  cat "${cache}/${name}"
}
# log MESSAGE...
# Print the current date followed by all arguments (joined by spaces) on
# stdout. printf is used instead of echo so that backslashes in the message
# are printed literally under every /bin/sh.
log() {
  printf '%s %s\n' "$(date)" "$*"
}
# Import movies.list.gz into the table "movies" of imdb.db.
# The table is created first; if that step fails the download and import are
# not attempted and the "skip" branch is taken. sed keeps only lines of the
# form "<title><tabs><digits>" and rewrites them to "<title>|<digits>", which
# sqlite3 then imports from its standard input via /proc/self/fd/0.
# NOTE(review): the \t and \+ escapes in the sed expression are GNU
# extensions and /proc/self/fd/0 needs a Linux-style /proc.
log "import start movies.list.gz"
if echo "CREATE TABLE movies (title text, year int);" \
    | sqlite3 imdb.db \
  && get_file movies.list.gz \
    | zcat \
    | sed -n -e 's/^\([^\t]\+\)\t\+\([0-9]\+\)$/\1|\2/p' \
    | sqlite3 imdb.db '.import /proc/self/fd/0 movies'; then
  log "import done movies.list.gz"
else
  log "skip movies.list.gz"
fi
# Import release-dates.list.gz into the table "release_dates" of imdb.db.
# The table is created first; if that step fails the download and import are
# not attempted and the "skip" branch is taken. sed keeps only lines of the
# form "<title><tabs><rest>" and rewrites them to "<title>|<rest>", which
# sqlite3 then imports from its standard input via /proc/self/fd/0.
# NOTE(review): the \t and \+ escapes in the sed expression are GNU
# extensions and /proc/self/fd/0 needs a Linux-style /proc.
log "import start release-dates.list.gz"
if echo "CREATE TABLE release_dates (title text, release text);" \
    | sqlite3 imdb.db \
  && get_file release-dates.list.gz \
    | zcat \
    | sed -n -e 's/^\([^\t]\+\)\t\+\(.\+\)$/\1|\2/p' \
    | sqlite3 imdb.db '.import /proc/self/fd/0 release_dates'; then
  log "import done release-dates.list.gz"
else
  log "import skip release-dates.list.gz"
fi
exact match rename
# Dry run: for every entry of the current directory whose name contains no
# '#', ask helper2.sh (mode "e") for a new name and print the mv command that
# would perform the rename; print "fail <name>" when helper2.sh fails.
# -mindepth 1 keeps "." itself out of the list. IFS= and read -r preserve
# leading/trailing blanks and backslashes in file names.
find . -mindepth 1 -maxdepth 1 -not -name '*#*' | while IFS= read -r line; do
  file=${line#./}
  if newfile=$(helper2.sh e "$file" 2>/dev/null); then
    printf 'mv "%s" "%s"\n' "$file" "$newfile"
  else
    printf 'fail %s\n' "$file"
  fi
done
#!/bin/bash
# filename_cache KEY
# Print the cache file path that belongs to KEY (the visible caller passes a
# URL). The path is "<script dir>/.movie_cache/<md5 of KEY plus newline>".
# The cache directory is created when missing.
# Arguments: $1 - key to hash
# Outputs:   the cache file path on stdout
filename_cache() {
  local cachedir digest
  cachedir="$(dirname "$0")/.movie_cache"
  [[ -d "$cachedir" ]] || mkdir -p "$cachedir"
  # printf (not echo) so that a key such as "-n" is hashed, not eaten as an
  # option. The trailing newline keeps the hashes identical to those produced
  # by echo for ordinary keys, so existing cache files stay valid.
  digest=$(printf '%s\n' "$1" | md5sum | cut -d ' ' -f 1)
  printf '%s\n' "${cachedir}/${digest}"
}
# fetch_cache URL
# Make sure URL is present in the local cache and print it.
# Arguments: $1 - URL to fetch
# Outputs:   stdout - first the cache file path (one line), then the cached
#                     content; stderr - "Failed to fetch <url>" on error
# Returns:   0 on success, 1 when the download fails or yields an empty file
# NOTE(review): the path line precedes the content on stdout; the caller in
# this file discards stdout and recomputes the path with filename_cache.
fetch_cache() {
  local url=$1
  local cache_filename
  cache_filename=$(filename_cache "${url}")
  printf '%s\n' "${cache_filename}"
  if [[ ! -f "${cache_filename}" ]]; then
    # A failed transfer and an empty result are handled alike: report it and
    # remove the file so the next call retries. -s tests the byte size; block
    # counts from du can be 0 for small non-empty files on some filesystems.
    if ! wget -O "${cache_filename}" -q "${url}" \
        || [[ ! -s "${cache_filename}" ]]; then
      echo "Failed to fetch $url" >&2
      rm -f "${cache_filename}"
      return 1
    fi
  fi
  cat "${cache_filename}"
}
# Command dispatch. $1 selects the mode, $2 is the title to look up.
#   e*  exact match: query the IMDb search page for the title. When the
#       answer is a title page, print "<title> #<id>" and exit 0; otherwise
#       report on stderr and exit 1.
case "$1" in
  e*)
    title="$2"
    url="http://www.imdb.com/find?q=$title;tt=off"
    # fetch_cache has already reported the problem on stderr; without the
    # page there is nothing to inspect.
    fetch_cache "$url" >/dev/null || exit 1
    file=$(filename_cache "$url")
    if grep -q -E 'og:url.*http:\/\/www.imdb.com\/title\/tt' "$file"; then
      # title page: take the id from the canonical link and the name from
      # the <title> element (first match of each; read trims surrounding
      # blanks, -r keeps backslashes intact)
      read -r id < <( sed -n -e 's/.*link rel="canonical" href="http:\/\/www.imdb.com\/title\/\(tt[0-9]*\).*/\1/p' < "$file" )
      read -r title < <( sed -n -e 's/<title>\(.*\) - IMDb<\/title>/\1/p' < "$file" )
      # Never hand an incomplete name to the caller, it would be used as a
      # rename target.
      if [[ -z "$id" || -z "$title" ]]; then
        echo "could not parse title page" >&2
        exit 1
      fi
      printf '%s #%s\n' "$title" "$id"
    else
      # search result page
      echo "no exact match" >&2
      exit 1
    fi
    ;;
esac