#!/usr/local/bin/perl
use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

#
# downsample.pl by William Pu
#
# Usage:
#   perl downsample.pl -i infile -o outfile [-p PROB -l LINES]
#
# Downsample a text file by keeping each block of lines with probability
# PROB and dropping it otherwise.
#
#   -i infile   input file (required); the name is taken literally
#   -o outfile  output file (required); overwritten if it already exists
#   -p PROB     probability, in the range 0-1, of keeping a block;
#               default = 1 (keep everything)
#   -l LINES    number of lines per block, a positive integer; default = 1.
#               For example, use 4 for standard four-line FASTQ records and
#               1 for one-record-per-line formats such as bowtie output.
#
# One random draw is made per block, so the lines of a block are always
# kept or dropped together.  If the input ends in the middle of a block,
# the partial block is still kept or dropped as a unit.
#
# Note: earlier versions parsed options with the Switch source filter,
# which is no longer part of core Perl; a plain lookup table is used now.
#

my $USAGE = "usage: downsample.pl -i infile -o outfile [-p PROB -l LINES]";

my $PROB    = 1;
my $LINES   = 1;
my $infile  = "";
my $outfile = "";

# Read parameters.  Every flag takes exactly one value, so map each flag to
# the variable that receives it.
report_error($USAGE) if !@ARGV;

my %target_for = (
    '-i' => \$infile,
    '-o' => \$outfile,
    '-p' => \$PROB,
    '-l' => \$LINES,
);

for (my $i = 0; $i < @ARGV; $i++) {
    my $flag   = $ARGV[$i];
    my $target = $target_for{$flag}
        or report_error("invalid parameter $flag");

    # Step onto the flag's value; a flag at the very end has none.
    $i++;
    report_error("missing value for $flag") if $i >= @ARGV;
    ${$target} = $ARGV[$i];
}

# Validate parameters before touching any file.
if ($infile eq "" || $outfile eq "") {
    report_error("invalid input or output file");
}
if (!looks_like_number($PROB) || $PROB < 0 || $PROB > 1) {
    report_error("PROB must be a number in the range 0-1, got '$PROB'");
}
if ($LINES !~ /\A[1-9][0-9]*\z/) {
    # A block size of 0 would never consume input and loop forever.
    report_error("LINES must be a positive integer, got '$LINES'");
}

print "$infile $outfile\n";

# Three-argument open: file names can never be interpreted as modes or pipes.
open(my $in_fh, '<', $infile)
    or die "cannot open input file '$infile': $!\n";
open(my $out_fh, '>', $outfile)
    or die "cannot open output file '$outfile': $!\n";

BLOCK:
while (defined(my $line = <$in_fh>)) {
    # rand() returns a value in [0, 1), so PROB = 1 keeps every block and
    # PROB = 0 keeps none.
    my $keep = rand() < $PROB;
    print {$out_fh} $line if $keep;

    # Consume the remaining LINES - 1 lines of this block.
    for (2 .. $LINES) {
        $line = <$in_fh>;
        last BLOCK if !defined $line;    # input ended inside a block
        print {$out_fh} $line if $keep;
    }
}

# Close files.  Buffered write errors only surface when the handle is closed.
close($in_fh);
close($out_fh)
    or die "error writing output file '$outfile': $!\n";

exit;

# report_error(MESSAGE...)
# Print the message, followed by a newline, on STDERR and terminate the
# script with a non-zero exit status.  Does not return.
sub report_error {
    die "@_\n";
}