(declaim (optimize debug safety)) (eval-when (:compile-toplevel :load-toplevel :execute) (require :ftp) (require :inflate "~/lisp/external/inflate-stream")) (defpackage :egome (:export #:surf #:create-index) (:use :common-lisp :net.ftp.client :util.zip)) (in-package :egome) ;; defaults for FTP'ing the NCBI human protein dataset (defparameter *human-protein-ftp-site* "ftp.ncbi.nlm.nih.gov") (defparameter *human-protein-ftp-filename* "genomes/H_sapiens/protein/protein.fa.gz") (setq net.ftp.client:*default-user* "anonymous") (setq net.ftp.client:*default-password* "Larry.Hunter@uchsc.edu") ;; The amino acid characters: (defconstant +aa-chars+ "ACDEFGHIKLMNPQRSTVWY") ;; Proteins have a header and a sequence. (defstruct protein header sequence) ;; A hit is a location in a protein. (defstruct hit protein start) ;; The *kmer-table* is a hash table which indexes a list of hits by a kmer string. (defvar *kmer-table* (make-hash-table :test #'equal)) (defun kmer-hits (key) (gethash key *kmer-table*)) (defun index-kmer (key value) (push value (gethash key *kmer-table*))) ;; *k-min* *k-max* are the shortest and longest subsequences to be indexed. (defparameter *k-min* 4) (defparameter *k-max* 5) ;; EGOME:SURF is the main interface to this code. It takes a name as input and ;; returns all the exact matches in the proteome. ;; It use a blast-like strategy to find all exact matches of a query. If the ;; name is within the subseq size range, just return the entries for it. If it ;; is longer, use the max length initial subsequence of the name and try to ;; extend it. Can't find entries shorter than the minimum... (defun surf (name) (setq name (string-upcase name)) (let ((length (length name)) (matches nil)) (cond ((illegal-chars name) (format t "~%Name contains characters not in the amino acid alphabet:~{ ~a~}" (illegal-chars name))) ((< length *k-min*) (format t "~%Didn't index names shorter than ~d characters" *k-min*)) ((> length *k-max*) (let ((prefix (subseq name 0 *k-max*))) (dolist (prefix-hit (kmer-hits prefix)) (let ((sequence (protein-sequence (hit-protein prefix-hit)))) (when (string-equal name sequence :start2 (hit-start prefix-hit) :end2 (min (length sequence) (+ (hit-start prefix-hit) length))) (push prefix-hit matches)))))) (t (setq matches (kmer-hits name)))) (if matches (dolist (match matches) (format t "~%Position ~d in ~a" (hit-start match) (protein-header (hit-protein match)))) (format t "~%No matches found")) matches)) ;; EGOME:CREATE-INDEX builds the table necessary for EGOME:SURF to work. ;; Main loop reads FASTA records from a file and indexes them. (defun create-index (&key max-reads) (let ((stream (open-proteins-stream))) (unwind-protect ; make sure to close stream (do ((protein (read-fasta stream) (read-fasta stream)) (reads 0 (1+ reads))) ((or (null protein) (and max-reads (= reads max-reads))) 'done) (index-protein protein)) (close-proteins-stream stream)))) ;; Efficient fasta reader. Allocate sequences adjustable as vectors, adjusted in ;; units of 100. Ignore newlines. Return nil if we try to read at EOF. (defun read-fasta (stream) (let ((sequence (make-array 100 :adjustable t :fill-pointer 0 :element-type 'character)) (header (read-line stream nil nil))) (when header (do ((peek (peek-char nil stream nil 'eof) (peek-char nil stream nil 'eof))) ((or (eq 'eof peek) (char= peek #\>)) (make-protein :header header :sequence (string-upcase sequence))) (let ((char (read-char stream))) (unless (char= #\newline char) (vector-push-extend char sequence 100))))))) ;; To index a protein, but it in the ID->protein map and then index all the ;; subsequences between subseq-min-size and subseq-max-size (defun index-protein (protein) (let ((sequence (protein-sequence protein))) (dotimes (i (1+ (- (length sequence) *k-min*))) (dotimes (j (1+ (- *k-max* *k-min*))) (unless (> (+ i j *k-min*) (length sequence)) (index-kmer (subseq sequence i (+ i j *k-min*)) (make-hit :protein protein :start i))))))) ;; Queries might have illegal characters, and it would be nice to report which. (defun illegal-chars (sequence) (let ((illegals nil)) (dolist (char (coerce sequence 'list) illegals) (unless (find char +aa-chars+ :test #'char-equal) (push char illegals))))) ;; open-proteins-stream. The information necessary downloaded from NCBI to a ;; temp file. If a temp file already exists and is at least as new as the ;; current NCBI file, use it. Handles compressed files properly. This stream ;; must be closed explicitly with close-proteins-stream (defun open-proteins-stream () (let ((pathname (make-pathname :directory "/tmp" :defaults *human-protein-ftp-filename*))) (format t "~%Checking for update of cached proteins file") (if (probe-file pathname) (with-open-ftp-connection (ftp *human-protein-ftp-site*) (let ((local-write (file-write-date pathname)) (remote-write (ftp-stream-file-mod-time ftp *human-protein-ftp-filename*))) (when (> remote-write local-write) (delete-file pathname) (format t "~%Getting more recent version of human proteins from NCBI") (ftp-stream-get ftp *human-protein-ftp-filename* pathname)))) (progn (format t "~%Getting human proteins from NCBI") (ftp-get *human-protein-ftp-site* *human-protein-ftp-filename* pathname))) (let ((raw (open pathname))) (if (string-equal "gz" (pathname-type pathname)) (progn (format t "~%Uncompressing.") (skip-gzip-header raw) (make-instance 'inflate-stream :input-handle raw)) raw)))) (defun close-proteins-stream (stream) (when (typep stream 'inflate-stream) (close (slot-value stream 'excl::input-handle))) (close stream))