Doppelte IMDb-IDs finden

# List entries of the current directory whose "#tt…" suffix (the IMDb ID and
# whatever follows it in the name) occurs more than once; each duplicated
# suffix yields one output line with all matching names.
# The suffix is handed to the inner shell as a positional parameter ($1) and
# xargs reads newline-delimited input, so spaces, quotes or shell
# metacharacters in a file name are never interpreted as shell code.
find . -maxdepth 1 \
  | grep -o -E '#tt.*' \
  | sort \
  | uniq -d \
  | xargs -d '\n' -I'{}' sh -c 'echo *"$1"' sh '{}'
# virtualenv
# NOTE(review): the heading suggests these commands are meant to be run inside
# a Python virtualenv — confirm.
# The leading "$ " is a shell prompt copied from a terminal session; type only
# the part after it.
# Install the IMDbPY package (presumably the source of imdbpy2sql.py — confirm).
$ pip install IMDbPY
# Load the data into the SQLite file imdb.db in the current directory (`pwd`
# expands to it inside the URI). "-d imdb" presumably names the directory that
# holds the downloaded IMDb list files — verify against the imdbpy2sql.py help.
$ imdbpy2sql.py -d imdb -u "sqlite:///`pwd`/imdb.db" --sqlite-transactions

postgres

# Download the IMDb movies list, reduce it to "title|year" records and bulk
# load them into table "movies" of the PostgreSQL database "imdb".
#   - awk splits every line on runs of one or more tabs ('\t+'; the earlier
#     '\t*' could also match the empty string) and prints "title|year" only
#     when the second field consists of digits.
#   - psql replaces the table contents: TRUNCATE followed by COPY from stdin,
#     reading the piped records as LATIN1 with '|' as the column delimiter.
# NOTE(review): when the download fails, COPY still receives (empty) input
# after the TRUNCATE — confirm that an emptied table is acceptable.
wget -q -O - 'ftp://ftp.sunet.se/pub/tv+movies/imdb/movies.list.gz' \
            | gzip -d -c - \
            | awk -F '\t+' '$2 ~ /^[0-9]+$/ { print $1 "|" $2 }' \
            | psql -h 10.2.2.13 -c "
                SET CLIENT_ENCODING TO 'LATIN1';
                TRUNCATE TABLE movies;
                COPY movies FROM STDIN DELIMITER '|'" imdb

sqlite

#!/bin/sh
# Downloaded list files are kept in a "cache" directory beside this script;
# create it on first use.
cache="$(dirname "$0")/cache"
if [ ! -d "${cache}" ]; then
    mkdir "${cache}"
fi
# Write the named IMDb list file to stdout, downloading it on first use.
# Globals:   cache (read) - directory holding previously downloaded files
#            name, get_file_tmp (written)
# Arguments: $1 - file name on the FTP mirror, e.g. movies.list.gz
# Outputs:   file content on stdout; wget progress and errors on stderr
# Returns:   0 on success, 1 when the download fails or is empty
# The download goes to a temporary file first and is moved into the cache only
# once wget succeeded, so an interrupted or failed transfer can never be
# served as a valid cached copy on later runs.
get_file() {
    name="$1"
    if [ -f "${cache}/${name}" ]; then
        cat "${cache}/${name}"
        return
    fi
    get_file_tmp="${cache}/${name}.part.$$"
    if wget -O "${get_file_tmp}" "ftp://ftp.fu-berlin.de/pub/misc/movies/database/${name}" \
        && [ -s "${get_file_tmp}" ] \
        && mv "${get_file_tmp}" "${cache}/${name}"; then
        cat "${cache}/${name}"
    else
        rm -f "${get_file_tmp}"
        echo "get_file: download of ${name} failed" >&2
        return 1
    fi
}

# Print all arguments as one line on stdout, prefixed with the output of
# `date` and a single space.
log() {
    echo "$(date) $*"
}


# Import movies.list.gz into table "movies" (title, year) of imdb.db.
# The CREATE TABLE step comes first and gates everything else: when it fails
# (presumably because the table already exists from an earlier run) nothing is
# downloaded or imported and the step is logged as skipped.
# sed keeps only lines shaped "<title><TAB…><digits>" and rewrites them to
# "<title>|<digits>"; sqlite3 reads those records from its own stdin through
# /proc/self/fd/0.
# An explicit if/else replaces the former "A && B || C" chain, in which the
# skip message could also fire after a successful import.
log "import start movies.list.gz"
if echo "CREATE TABLE movies (title text, year int);" \
    | sqlite3 imdb.db \
    && get_file movies.list.gz \
    | zcat \
    | sed -n -e 's/^\([^\t]\+\)\t\+\([0-9]\+\)$/\1|\2/p' \
    | sqlite3 imdb.db '.import /proc/self/fd/0 movies'; then
    log "import done movies.list.gz"
else
    log "import skip movies.list.gz"
fi

# Import release-dates.list.gz into table "release_dates" (title, release) of
# imdb.db.
# The CREATE TABLE step comes first and gates everything else: when it fails
# (presumably because the table already exists from an earlier run) nothing is
# downloaded or imported and the step is logged as skipped.
# sed keeps only lines shaped "<title><TAB…><rest>" and rewrites them to
# "<title>|<rest>"; sqlite3 reads those records from its own stdin through
# /proc/self/fd/0.
# An explicit if/else replaces the former "A && B || C" chain, in which the
# skip message could also fire after a successful import.
log "import start release-dates.list.gz"
if echo "CREATE TABLE release_dates (title text, release text);" \
    | sqlite3 imdb.db \
    && get_file release-dates.list.gz \
    | zcat \
    | sed -n -e 's/^\([^\t]\+\)\t\+\(.\+\)$/\1|\2/p' \
    | sqlite3 imdb.db '.import /proc/self/fd/0 release_dates'; then
    log "import done release-dates.list.gz"
else
    log "import skip release-dates.list.gz"
fi

exact match rename

# For every entry of the current directory whose name contains no '#', ask
# helper2.sh (mode "e") for the new name and print — not execute — the
# corresponding mv command; print "fail <name>" when the helper exits non-zero.
#   - -mindepth 1 keeps "." itself out of the list.
#   - IFS= read -r preserves leading/trailing blanks and backslashes in names.
#   - printf with quoted arguments keeps runs of whitespace inside names
#     intact in the printed command.
find . -mindepth 1 -maxdepth 1 -not -name '*#*' | while IFS= read -r line; do
    file=${line#./}
    if newfile=$(helper2.sh e "$file" 2>/dev/null); then
        printf 'mv "%s" "%s"\n' "$file" "$newfile"
    else
        printf 'fail %s\n' "$file"
    fi
done

#!/bin/bash

# Print the cache file path that belongs to a key (the callers pass a URL).
# Globals:   cachedir (written) - "<directory of $0>/.movie_cache", created
#            on demand
# Arguments: $1 - key
# Outputs:   "<cachedir>/<md5>" on stdout, where <md5> is the MD5 hex digest of
#            the key exactly as echo emits it (trailing newline included)
filename_cache() {
    cachedir="$(dirname "$0")/.movie_cache"
    if [[ ! -d "$cachedir" ]]; then
        mkdir -p "$cachedir"
    fi
    local digest
    digest=$(echo "$1" | md5sum | cut -d " " -f 1)
    echo "${cachedir}/${digest}"
}

# Fetch a URL through the on-disk cache.
# Globals:   url, cache_filename (written; left global as before)
# Arguments: $1 - URL to fetch
# Outputs:   stdout: the cache file path on the first line, followed by the
#            cached content; stderr: "Failed to fetch <url>" on failure
# Returns:   0 on success; 1 when wget fails or delivers an empty file (the
#            cache file is removed in that case)
fetch_cache() {
    url=$1
    # IFS= and -r keep blanks and backslashes in the path untouched.
    IFS= read -r cache_filename < <( filename_cache "${url}" )
    # NOTE(review): the path is printed ahead of the content, which looks like
    # leftover debug output; kept because callers may depend on it — confirm.
    echo "$cache_filename"
    # -s (exists and is non-empty) replaces the former du-based check, which
    # looked at allocated blocks rather than content length; it also makes a
    # stale empty cache file trigger a fresh download instead of a cache hit.
    if ! [ -s "${cache_filename}" ]; then
        if ! wget -O "${cache_filename}" -q "${url}" || ! [ -s "${cache_filename}" ]; then
            echo "Failed to fetch $url" >&2
            rm -f "${cache_filename}"
            return 1
        fi
    fi
    cat "${cache_filename}"
    return 0
}


# Percent-encode a string for use as a URL query value.
# Works byte-wise on the od hex dump, so the result does not depend on the
# locale; only ASCII letters, digits and "-._~" are passed through unchanged.
# Arguments: $1 - raw string
# Outputs:   encoded string on stdout
_urlencode() {
    local hex ch out=
    # Intentional word splitting: od prints blank-separated two-digit hex bytes.
    for hex in $(printf '%s' "$1" | od -An -v -tx1); do
        case "$hex" in
            2d | 2e | 5f | 7e | 3[0-9] | 4[1-9a-f] | 5[0-9a] | 6[1-9a-f] | 7[0-9a])
                printf -v ch "\\x${hex}"
                out+=$ch
                ;;
            *)
                printf -v ch '%%%02X' "0x${hex}"
                out+=$ch
                ;;
        esac
    done
    printf '%s\n' "$out"
}

# Command dispatch on $1.
#   e* <title>  Look the title up via the IMDb find page. When the fetched
#               page carries an og:url pointing at a /title/tt… page
#               (presumably IMDb answering an unambiguous query with the title
#               page itself — confirm), print "<page title> #<tt id>".
#               Otherwise report "no exact match" on stderr and exit 1.
case "$1" in
    e*)
        title="$2"
        # Encode the title so that characters such as '&', '#' or '+' stay part
        # of the q= value instead of altering the query string.
        url="http://www.imdb.com/find?q=$(_urlencode "$title");tt=off"
        # A failed download is reported by fetch_cache itself; stop here rather
        # than misreporting it as "no exact match".
        if ! fetch_cache "$url" >/dev/null; then
            exit 1
        fi
        file=$(filename_cache "$url")
        if grep -q -E 'og:url.*http://www.imdb.com/title/tt' "$file"; then
            # title page: pull the tt id from the canonical link and the
            # display title from the <title> element
            read -r id < <( sed -n -e 's/.*link rel="canonical" href="http:\/\/www.imdb.com\/title\/\(tt[0-9]*\).*/\1/p' < "$file" )
            read -r title < <( sed -n -e 's/<title>\(.*\) - IMDb<\/title>/\1/p' < "$file" )
            if [[ -z "$id" || -z "$title" ]]; then
                echo "could not parse title page" >&2
                exit 1
            fi
            echo "$title #$id"
        else
            # search result page
            echo "no exact match" >&2
            exit 1
        fi
    ;;
esac