To know your shell better:
$ env
$ man bash
$ info man
# info pages give a bit more detail than man pages
STDIN, STDOUT, STDERR
$ curl fakeurl
# print error message to screen
$ curl fakeurl 2> errs.txt
# redirecting STDERR with 2>
$ curl google.com fakeurl 1> out1.txt 2> out2.txt
# redirecting STDOUT and STDERR with 1> and 2>
Command 101
$ touch file1
# not only used to create a zero-byte file but also to change the timestamp without affecting the content
$ echo line1 > lines.txt
# write text to a file via ">", overwriting existing content
$ echo line2 >> lines.txt
# append text to file via “>>”
$ wc file
# word count; output is lines, words, bytes, and file name unless restricted with an arg
$ wc -l file
# only number of lines
$ wc -L file
# the longest line length
$ yes | nl | head -100 > demo3.txt
# nl: number the lines; useful for filtering, debugging and creating quick datasets
$ nl *gbk | tail -1
# show the last line with line number
Text processing
$ less file
# view large file, can scroll up and down with j and k or ctrl+n and ctrl+p
head
$ head -n2
$ head -50
# display the first n lines; the default is 10
$ head -q *txt *gbk
# heads of multiple files w/o delimiters
$ head -c50 *gbk
# first 50 characters
tail
$ tail -n2 out1.txt
# display the last lines, by default 10 lines
$ tail -n+2 out1.txt
# start at the second line of the file
$ yes | nl | head -n 10000000 > foo &
$ tail -F foo
# run process in background and follow end of file
cat
$ cat demo.txt demo.txt demo.txt > demo2.txt
# print/concatenate files; an industrial-strength file viewer for large files or files with weird bytes
$ yes | head | cat - demo2.txt
# cat the input before the text of demo2.txt on the screen; doesn't change the file. cat demo2.txt - puts the input after
cut
$ cut -f2,5 *ptt | head
# pull out columns of a file, 2nd and 5th column in this one
$ cut -c1-5 *ptt | head
# pull out columns by characters, from 1st to 5th
paste
$ tail -n+3 *ptt | cut -f1 > locs
$ tail -n+3 *ptt | cut -f5 > genes
$ paste genes locs | head
# paste by columns
sort
$ sort genes | less
# default is alphabetic order
$ sort -r genes | less
# reverse
$ sort -R genes | less
# randomize
$ cut -f1 *ptt | tail -n+4 | sort | head
# used in pipe
$ sort -k1 -rn
# -k: start a key at POS1; -r: reverse the result of comparisons; -n: numeric sort, compare according to string numerical value
uniq
$ cut -f2 *ptt | tail -n+4 | sort | uniq -c | sort -k1 -rn
#report or filter out repeated lines in a file; with -c, precede each output line with the count of the number of times the line occurred in the input, followed by a single space.
split
$ split -d -l 1000 *ptt subset.ptt.
# in linux, split has -d for using numeric suffixes instead of alphabetic; split file per 1000 lines
Downloading and Syncing
wget
$ wget -w 2 -r -np -k -p http://www.stanford.edu/class/cs106
# recursively download an entire site, waiting 2 seconds between hits
-np (--no-parent): it guarantees that only the files below a certain hierarchy will be downloaded.
-k (--convert-links): After the download is complete, convert the links in the document to make them suitable for local viewing. This affects not only the visible hyperlinks, but any part of the document that links to external content, such as embedded images, links to style sheets, hyperlinks to non-HTML content, etc.
-p (--page-requisites): This option causes Wget to download all the files that are necessary to properly display a given HTML page. This includes such things as inlined images, sounds, and referenced stylesheets.
$ wget --help | less
# to know more
rsync
Synchronize local and remote files, transfer files between two machines.
-avp : archive, verbose, progress
$ rsync -avp srcdir/ user@remotehost:destdir/
# e.g. sync a local directory to a remote machine
“The rsync remote-update protocol allows rsync to transfer just the differences between two sets of files across the network connection, using an efficient checksum-search algorithm described in the technical report that accompanies this package.”
curl
For interacting with a single URL
$ GHUSER="lynneq"
# init a variable
$ curl -i https://api.github.com/users/$GHUSER/$GHVAR
# an api call with variables
System, process information
$ uname -a
$ hostname
# determine name of the current machine
$ whoami
# current user
$ sleep 10 &
$ ps xw | grep sleep | grep -v "grep"
# show running processes and grep for the keyword; the grep -v flag excludes any lines that contain the string grep, so the grep command itself (and e.g. "foogrep") won't show up
$ kill 11405
# kill process by its id
$ sleep 60 &
$ kill `ps xw | grep sleep | cut -f1 -d ' ' | head -1` # kill the process just started
$ top
# show the most active processes
Storage and finding
$ tar -cvf genome.tar genome
# make an archive of files as .tar
$ gzip genome.tar
# compress the file as .tar.gz
$ tar -xzvf genome.tar.gz
# unzip the file
$ gunzip genome.tar.gz
# decompress the file from tar.gz to tar
$ zcat xx.fa.gz
# identical to gunzip -c
$ find /etc | nl
# find a file, non-indexed search
$ sudo apt-get install -y locate
$ sudo updatedb
$ locate fstab
# find a file, indexed
$ df -h
# df: show filesystem disk space; -h for human-readable sizes
$ du --max-depth=1 -b | sort -k1 -rn
# du: directory utilization. determine which subdirectories are taking up a lot of disk space
grep
Searching within the files:
$ grep -l protein *ptt # --files-with-matches, only the names of files containing selected lines are written to standard output.
NC_007779.ptt
$ grep -B 1 -A 1 protein *ptt
# print context that 1 line ‘B’efore and ‘A’fter where the match found
$ grep 'protein.*' *ptt | sort | uniq
# regular expression