Skip to content

Commit

Permalink
uploading functional project
Browse files Browse the repository at this point in the history
  • Loading branch information
teachable committed Aug 21, 2018
1 parent 8fab458 commit a630de9
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 80 deletions.
40 changes: 16 additions & 24 deletions src/crawler/crawlerS.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,8 @@ public class crawlerS {
static ArrayList<String> docExt = new ArrayList<String>(); //where we'll be placing the file extension of the documents
static ArrayList<String> docs = new ArrayList<String>(); //the <a> links that are actually documents
static ArrayList<String> docTitles = new ArrayList<String>(); //used for folder names
static ArrayList<String> docGroups = new ArrayList<String>(); //used folder grouping later
static String posExt[] = {"pdf" , "msw12", "excel12book" };//possible attachment extensions
static String realExt[] = {"pdf" , "docx" , "xlsx"};
static String posExt[] = {"pdf" , "msw12" , "excel12book" , "crtext"};//possible attachment extensions
static String realExt[] = {"pdf" , "docx" , "xlsx" , "txt"};
static boolean nPage = true; //bool to verify if there is a next page
static ChromeOptions options = new ChromeOptions();
static String[] urlSplit; //where we place the document id
Expand Down Expand Up @@ -92,14 +91,7 @@ public static void crawl(String url, WebDriver crawler) throws InterruptedExcept
main.print(docs);
main.print("Now we start looking for the attachments");
compileList(crawler, docs);
down.loadAll(attLinks,attTitles,docExt,docGroups,docTitles);
/* int a=0;
for(String b : docLinks) { //for each doc in the documents string...
int downProg = 100-progress;
progress = progress+((downProg)*(a/docLinks.size()));
down.load(b, docTitles.get(a), docExt.get(a), docGroups.get(a));
a++;
}*/
down.loadAll(attLinks,attTitles,docExt,docTitles);
main.print("Should've downloaded all attachments listed.");
crawler.quit();
JOptionPane.showMessageDialog(draw.urlF, "We should've downloaded all attachments in the Docket Browser for this document!", "Done", JOptionPane.INFORMATION_MESSAGE);
Expand Down Expand Up @@ -138,8 +130,8 @@ public static void getLinks (WebDriver crawler) {
nextPage(crawler);
}

public static void percentArch(int a,int b) {
progress = (a/b)*80;
/**
 * Computes archive-scan progress on a 0-80 scale.
 *
 * The multiplication is done before the division so the result is
 * proportional: the previous {@code (a/b)*80} used integer division first
 * and therefore returned 0 for every {@code a < b}.
 *
 * @param a number of items processed so far
 * @param b total number of items
 * @return {@code a * 80 / b}, or 0 when {@code b} is not positive (nothing to process)
 */
public static int percentArch(int a,int b) {
    if (b <= 0) {
        return 0; // avoid ArithmeticException on an empty document list
    }
    // widen to long so a * 80 cannot overflow before the division
    return (int) ((long) a * 80 / b);
}

public static void compileList(WebDriver crawler, ArrayList<String> docs) {
Expand All @@ -154,28 +146,24 @@ public static void compileList(WebDriver crawler, ArrayList<String> docs) {
for(String ext : posExt) { //looking for files of every extension type
List<WebElement> loadAtt = wait.until(ExpectedConditions.visibilityOfAllElementsLocatedBy((By.cssSelector("a[href*='=" + ext + "']")))); //waiting for column to show
main.print("We've found files with the extension of: " + realExt[i]);
int attPer = 0; //attachments on this page for folder grouping later
for(WebElement att : loadAtt) {
try {
main.print("Found the attachment, adding to link collection");
attLinks.add(att.getAttribute("href"));
WebElement rTitle = wait.until(ExpectedConditions.presenceOfElementLocated((By.xpath("/html/body/div[3]/div[2]/div[2]/div[2]/div/div/div[1]/div[1]/h1"))));
try {
attLinks.add(att.getAttribute("href")); //adding download link
WebElement rTitle = wait.until(ExpectedConditions.presenceOfElementLocated((By.xpath("/html/body/div[3]/div[2]/div[2]/div[2]/div/div/div[1]/div[1]/h1"))));
docTitles.add(rTitle.getText()); //
try { //titles are nested weirdly so we're ripping em
WebElement parentElement = att.findElement(By.xpath("./..")); //getting to grandparentto try to
WebElement grandElement = parentElement.findElement((By.xpath("./.."))); //find the parent to find the title
WebElement title = grandElement.findElement(By.xpath("//h3")); //if its an attachment listed an alt way
rTitle = title;
attPer++; //haha lets hope this works
} catch(NoSuchElementException e) {
main.print("Defaulting to document title for file name...");
attPer++; //we just don't change the title now lmao
}
attTitles.add(rTitle.getText()); //adding the text of the above element to our list of document titles
docExt.add(realExt[i]); //adding the extension, filtered because this website is weird with extensions
percentArch(i,docs.size());
docGroups.add(Integer.toString(attPer));
attPer=0;
main.print("We are " + progress + "% percent done with checking these links :)");
// progress = percentArch(i,docs.size());
// main.print("We are " + progress + "% percent done with checking these links :)");
} catch(InvalidSelectorException e) {
main.print("Couldn't find a document with an extension of \"" + ext + "\" on this page. \n Continuing...");
}
Expand All @@ -184,7 +172,11 @@ public static void compileList(WebDriver crawler, ArrayList<String> docs) {
}
} catch (TimeoutException e) {
main.print("Couldnt find document with current extension, continuing.");
attLinks.add("No Att");
attLinks.add(null);
docExt.add(null);
attTitles.add(null);
docTitles.add(null);

}
}
}
Expand Down
64 changes: 28 additions & 36 deletions src/crawler/down.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,48 +15,39 @@
import java.util.zip.ZipFile;

public class down{
/**
 * Downloads every attachment that has a link.
 *
 * The four lists are read in parallel by index: entry {@code n} of
 * {@code titles}, {@code ext} and {@code folds} describes entry {@code n}
 * of {@code links}. Entries whose link is {@code null} are skipped.
 *
 * @param links  attachment links (may contain {@code null} placeholders)
 * @param titles attachment titles, passed to {@code load} as the title
 * @param ext    attachment extensions
 * @param folds  folder name for each attachment
 */
public static void loadAll(ArrayList<String> links, ArrayList<String> titles, ArrayList<String> ext, ArrayList<String> folds) {
    File attachmentsRoot = new File("Attachments");
    attachmentsRoot.mkdir(); // make the attachments folder
    for (int idx = 0; idx < links.size(); idx++) {
        String url = links.get(idx);
        if (url == null) {
            continue; // placeholder entry: nothing to download
        }
        main.print("Referencing index " + idx + " out of " + links.size());
        load(url, titles.get(idx), ext.get(idx), folds.get(idx));
    }
}

/**
 * Five-argument variant of loadAll that uses per-document group counts.
 *
 * {@code groups} is flattened through {@code toString()}, stripped of
 * brackets and whitespace, split on commas and parsed into ints. Links equal
 * to the sentinel "No Att" are not downloaded; each one advances
 * {@code document} instead.
 *
 * NOTE(review): {@code count} is never reset in this method, and it is used
 * both as the index into {@code titles}/{@code ext} and as the per-folder
 * counter compared against {@code groupNumb[document]} - confirm this is intended.
 * NOTE(review): {@code url.equals("No Att")} will throw if a link is null.
 *
 * @param links  attachment links, with "No Att" entries as separators
 * @param titles attachment titles, indexed by {@code count}
 * @param ext    attachment extensions, indexed by {@code count}
 * @param groups attachment counts, one numeric string per entry
 * @param folds  folder names, indexed by a value taken from {@code groupNumb}
 */
public static void loadAll(ArrayList<String> links, ArrayList<String> titles, ArrayList<String> ext, ArrayList<String> groups, ArrayList<String> folds) {
String folder = groups.toString().replaceAll("[\\[\\]\\s]", ""); //what folder are we on?
int[] groupNumb = Arrays.stream(folder.split(",")).mapToInt(Integer::parseInt).toArray(); //grouping of attachments as ints
int document = 0; //what document in that directory are we on?
int groupCount = 0; //what docs-per-folder are we comparing this number to?
int count = 0; //what number are we in terms of docs-per-folder?
for(String url : links) {
if(!url.equals("No Att")) {
if(count < groupNumb[document]) { //if there's still more files to fit in this folder
try {
// NOTE(review): the folder is looked up with a group *count* (groupNumb[groupCount]) as the index into folds - verify
load(url, titles.get(count), ext.get(count), folds.get(groupNumb[groupCount])); //try to download the file
count++;
} catch (IOException e) {
// a failed download is logged and still counted, so the loop moves on
main.print("failed to DL attachment, continuing...");
count++;
e.printStackTrace();
}
}
}
else {
// sentinel entry: move on to the next document's group count
document++;
}
// NOTE(review): compares count to the number of groups minus one, not to a group size - confirm
if(count==groupNumb.length-1) {
groupCount++;
}
}
}

/**
 * Deletes the given file.
 *
 * The outcome reported by {@link File#delete()} is not passed on to the caller.
 *
 * @param file the file to delete
 */
public static void delete(File file) {
    final boolean ignored = file.delete(); // result deliberately unused, same as a bare call
}

public static void load(String link, String title, String ext, String fold) throws IOException { //goes through list of pdfs, downloads
public static void load(String link, String title, String ext, String fold) { //goes through list of pdfs, downloads
main.print("Downloading file titled " + title);
String T = title.replaceAll("[\\\\/:*?\"<>|]", "_") + "." + ext; //we're sanitizing the title
new File("Attachments" + fold).mkdirs(); //making a folder for it
URL website = new URL(link); //makes a URL from the first string passed
ReadableByteChannel rbc = Channels.newChannel(website.openStream()); //opens the given url as a stream of bytes
FileOutputStream fos = new FileOutputStream(new File("Attachments/" + fold + "/" + T)); //create new fileoutput stream, file name as the stored PDF title"
fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE); //writes the above bytes to the file output stream
fos.close(); //closes file output stream
try {
String T = title.replaceAll("[\\\\/:*?\"<>|]", "_"); //we're sanitizing the title
new File("Attachments/" + fold).mkdirs();
File attachment = File.createTempFile("Attachments/" + fold + "/" + T, ext);
URL website = new URL(link); //makes a URL from the first string passed
ReadableByteChannel rbc = Channels.newChannel(website.openStream()); //opens the given url as a stream of bytes
FileOutputStream fos = new FileOutputStream(attachment); //create new fileoutput stream, file name as the stored PDF title"
main.print(new File("Attachments/" + fold + "/" + T).getAbsolutePath());
fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE); //writes the above bytes to the file output stream
fos.close(); //closes file output stream
} catch (IOException e) {
main.print("failed to DL attachment, continuing...");
e.printStackTrace();
}
}

public static void unzip(String a) throws IOException { //utility for unzipping
Expand Down Expand Up @@ -86,6 +77,7 @@ public static void unzip(String a) throws IOException { //utility for unzipping
while ((length = is.read(bytes)) >= 0) {
fos.write(bytes, 0, length);
}

is.close();
fos.close();

Expand Down
40 changes: 20 additions & 20 deletions src/crawler/draw.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
public class draw extends JPanel implements ActionListener {

String userU;
JButton go = new JButton("Get PDFs");
JButton go = new JButton("Start");
static JTextField urlF = new JTextField(20);
static JProgressBar progressBar = new JProgressBar(0, 100);

Expand Down Expand Up @@ -57,22 +57,22 @@ public void actionPerformed(ActionEvent e) {
userU = urlF.getText();
main.print(userU);
try {
new Thread(new Runnable(){
public void run(){
int x = 0;
while(x<=100) {
x = crawlerS.progress;
progressBar.setValue(x); // Setting incremental values
if (x == 100 ){
progressBar.setString("Done with the download!"); // End message
try{
Thread.sleep(200);
}catch(Exception ex){
}
}
}
}
}).start();
// new Thread(new Runnable(){
// public void run(){
// int x = 0;
// while(x<=100) {
// x = crawlerS.progress;
// progressBar.setValue(x); // Setting incremental values
// if (x == 100 ){
// progressBar.setString("Done with the download!"); // End message
// try{
// Thread.sleep(200);
// }catch(Exception ex){
// }
// }
// }
// }
// }).start();
crawlerS.initC(userU); //passes the url to the crawlerS class so it can start searching
} catch (InterruptedException malf) {
main.print("Malformed URL");
Expand Down Expand Up @@ -102,12 +102,12 @@ public void focusLost(FocusEvent e) {
});
urlF.setText("Place document URL here");

progressBar.setValue(0);
progressBar.setStringPainted(true);
// progressBar.setValue(0);
// progressBar.setStringPainted(true);

add(urlF, BorderLayout.CENTER);
add(go, BorderLayout.SOUTH);
add(progressBar,BorderLayout.NORTH);
// add(progressBar,BorderLayout.NORTH);

JFrame test = new JFrame();

Expand Down

0 comments on commit a630de9

Please sign in to comment.