Добавил:
Upload Опубликованный материал нарушает ваши авторские права? Сообщите нам.
Вуз: Предмет: Файл:
Main.docx
Скачиваний:
1
Добавлен:
17.12.2018
Размер:
413.7 Кб
Скачать

Приложение Б.2

Функция фильтрации

/**
 * Filters a raw word list, replaces inflected forms with a base form when one
 * is found in {@code Bases[1]} (or in the filtered input itself), and returns
 * every resulting word together with its number of occurrences.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Reload {@code WordBase} via {@code GetWordBase()}.</li>
 *   <li>Keep only words longer than two characters that are contained neither
 *       in {@code WordBase} nor in {@code Bases[0]}.</li>
 *   <li>For each kept word apply at most one suffix rule, chosen by the first
 *       matching setting/suffix pair: {@code F_ed} ("ed"), {@code F_s} ("s"),
 *       {@code F_ing} ("ing"), {@code F_ly} ("ly"), {@code F_er} ("er"/"est").
 *       Words matching no rule are copied unchanged. Some rule branches add
 *       nothing, so a word can be dropped.</li>
 *   <li>Sort the result and collapse equal neighbours into (word, count) rows.</li>
 *   <li>For each row, replace a matching inflected entry of {@code WordBase}
 *       with the base word via {@code Replace(...)}.</li>
 * </ol>
 *
 * <p>Side effects: assigns {@code WordBase} and {@code TotalCount}, writes
 * progress lines through {@code System.out} and {@code out(...)}, and calls
 * {@code Replace(WordBase, ...)}.
 *
 * <p>NOTE(review): {@code Bases[0]} is presumably a list of names and
 * {@code Bases[1]} a dictionary of base forms (inferred from the inline
 * comments only) - confirm against their declarations.
 *
 * @param Dirty words to filter; iterated once and not modified here
 * @return one {@code Object[3]} per distinct resulting word: index 0 holds the
 *         word, index 1 its occurrence count, index 2 is left unset (null)
 */
public ArrayList<Object[]> Filter(ArrayList<String> Dirty){
    // words that passed the length / known-word pre-filter
    ArrayList<String> CleanArray = new ArrayList<String>();
    // words after all suffix rules, duplicates still present
    ArrayList<String> FinalList = new ArrayList<String>();
    // distinct words with their counts (the return value)
    ArrayList<Object[]> RatedFinalList = new ArrayList<Object[]>();

    WordBase = GetWordBase();

    for (String CurWord : Dirty){
        // delete very small words
        if (CurWord.length() > 2){
            // if the word is new and not a name
            if (WordBase.contains(CurWord) == false & Bases[0].contains(CurWord) == false){
                CleanArray.add(CurWord);
            }
        }
    }

    System.out.println("Count before filters: "+CleanArray.size());

    // NOTE(review): the setting is re-read through Settings.GetProps for every
    // word; left as is because GetProps is not visible from this block.
    // The conditions below use the non-short-circuit '&', so both operands are
    // always evaluated; every charAt/substring index is therefore kept in range
    // by an enclosing length check rather than by the operand to its left.
    for (String CurString : CleanArray){
        // cut 2nd form
        if (Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_ed"))
                & CurString.endsWith("ed") & CurString.length() > 3){
            // for words like compare - compared
            String CutD = CurString.substring(0, CurString.length()-1);
            // for words like enter - entered
            String CutED = CurString.substring(0, CurString.length()-2);
            // for words like stop - stopped
            String CutDoubles = CurString.substring(0, CurString.length()-3);

            // Nothing is added when a candidate base is already in CleanArray
            // or WordBase; the word is dropped in that case.
            if (CleanArray.contains(CutD) == false
                    & WordBase.contains(CutD) == false){
                if(Bases[1].contains(CutD) & CutD.length() > 2){
                    FinalList.add(CutD);
                } else if(CleanArray.contains(CutED) == false
                        & WordBase.contains(CutED) == false){
                    if(Bases[1].contains(CutED) & CutED.length() > 2){
                        FinalList.add(CutED);
                    } else if(CurString.charAt(CurString.length()-3) == CurString.charAt(CurString.length()-4)){
                        // doubled consonant before "ed"
                        if(CleanArray.contains(CutDoubles) == false
                                & WordBase.contains(CutDoubles) == false){
                            if(Bases[1].contains(CutDoubles) & CutDoubles.length() > 2){
                                FinalList.add(CutDoubles);
                            } else {
                                FinalList.add(CurString);
                            }
                        }
                    } else {
                        FinalList.add(CurString);
                    }
                }
            }
        } else if (Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_s"))
                & CurString.endsWith("s")){
            // cut 3rd person and plural form
            String CutS = CurString.substring(0, CurString.length()-1);
            String CutES = CurString.substring(0, CurString.length()-2);
            String CutIES = CurString.substring(0, CurString.length()-3);

            if (CurString.endsWith("sses")){ // like bosses
                if(CleanArray.contains(CutES) == false & WordBase.contains(CutES) == false){
                    if (Bases[1].contains(CutES) & CutES.length() > 2){
                        // correct derivatives (bosses -> boss)
                        FinalList.add(CutES);
                    } else {
                        FinalList.add(CurString);
                    }
                }
            } else if(CurString.endsWith("ies") & (CleanArray.contains(CutIES+"y")
                    || Bases[1].contains(CutIES+"y"))){
                // for words like category/categories
                FinalList.add(CutIES+"y");
            } else if (CleanArray.contains(CutS) == false & WordBase.contains(CutS) == false){
                if (Bases[1].contains(CutS) & CutS.length() > 2){
                    FinalList.add(CutS); // correct derivatives (gets -> get)
                } else {
                    FinalList.add(CurString);
                }
            }
        } else if (Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_ing"))
                & CurString.endsWith("ing")){
            String CutIng = CurString.substring(0, CurString.length()-3);

            if (CleanArray.contains(CutIng) == false & WordBase.contains(CutIng) == false){
                // for words like make/making
                if(CutIng.length() > 2 & (CleanArray.contains(CutIng+"e") || Bases[1].contains(CutIng+"e"))){
                    if(WordBase.contains(CutIng+"e") == false ){
                        FinalList.add(CutIng+"e");
                    }
                } else if (Bases[1].contains(CutIng) & CutIng.length() > 1){
                    FinalList.add(CutIng);
                } else if(CurString.length() > 6){
                    // Computed only here: taking length()-4 unconditionally threw
                    // StringIndexOutOfBoundsException for the 3-letter word "ing".
                    String CuttIng = CurString.substring(0, CurString.length()-4);
                    // doubled consonant before "ing"
                    if(CurString.charAt(CurString.length()-4) == CurString.charAt(CurString.length()-5)
                            & WordBase.contains(CuttIng) == false){
                        FinalList.add(CuttIng);
                    }
                } else {
                    FinalList.add(CurString);
                }
            }
        } else if (Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_ly"))
                & CurString.length() > 4 & CurString.endsWith("ly")){
            // cut adverbs
            String Cut = CurString.substring(0, CurString.length()-2);

            if(CurString.endsWith("cally")) {
                // for words like basically
                Cut = CurString.substring(0, CurString.length()-4);
                if(CleanArray.contains(Cut) == false
                        & WordBase.contains(Cut) == false){
                    if (Bases[1].contains(Cut) & Cut.length() > 2){
                        FinalList.add(Cut);
                    } else {
                        FinalList.add(CurString);
                    }
                }
            } else if(CleanArray.contains(Cut) == false
                    & WordBase.contains(Cut) == false){
                if (Bases[1].contains(Cut) & Cut.length() > 2){
                    FinalList.add(Cut);
                } else {
                    FinalList.add(CurString);
                }
            }
        } else if (Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_er")) & CurString.length() > 4
                & (CurString.endsWith("er") || CurString.endsWith("est"))){
            // for words like big/bigger/biggest
            if(CurString.endsWith("er")){
                String CutER = CurString.substring(0, CurString.length()-2);
                String CuttER = CurString.substring(0, CurString.length()-3);

                if (CleanArray.contains(CutER) == false & WordBase.contains(CutER) == false){
                    // for regular case (great/greater)
                    if (Bases[1].contains(CutER) & CutER.length() > 1){
                        FinalList.add(CutER);
                    } else if(CurString.length() > 4 & CurString.charAt(CurString.length()-3)
                            == CurString.charAt(CurString.length()-4) & WordBase.contains(CuttER) == false
                            & Bases[1].contains(CuttER)){
                        // for words like big/bigger with double letter
                        FinalList.add(CuttER);
                    } else if(CurString.charAt(CurString.length()-3) == (char) 'i'
                            & (CleanArray.contains(CuttER+"y") || Bases[1].contains(CuttER+"y"))){
                        // for words like easy/easier
                        FinalList.add(CuttER+"y");
                    } else {
                        FinalList.add(CurString);
                    }
                }
            } else if(CurString.endsWith("est")){
                String CutEST = CurString.substring(0, CurString.length()-3);
                String CuttEST = CurString.substring(0, CurString.length()-4);

                if (CleanArray.contains(CutEST) == false & WordBase.contains(CutEST) == false){
                    // for regular case (great/greatest)
                    if (Bases[1].contains(CutEST) & CutEST.length() > 1){
                        FinalList.add(CutEST);
                    } else if(CurString.length() > 5 & CurString.charAt(CurString.length()-4)
                            == CurString.charAt(CurString.length()-5) & WordBase.contains(CuttEST) == false
                            & Bases[1].contains(CuttEST)){
                        // doubled consonant before "est"
                        FinalList.add(CuttEST);
                    } else if(CurString.charAt(CurString.length()-4) == (char) 'i'
                            & (CleanArray.contains(CuttEST+"y") || Bases[1].contains(CuttEST+"y"))){
                        // "iest" -> "y"
                        FinalList.add(CuttEST+"y");
                    } else {
                        FinalList.add(CurString);
                    }
                }
            } else {
                // not reachable while the enclosing condition requires "er" or "est"
                FinalList.add(CurString);
            }
        } else {
            // no suffix rule applies: keep the word unchanged
            FinalList.add(CurString);
        }
    }

    // equal words become neighbours, which the counting loop below relies on
    Collections.sort(FinalList);

    TotalCount = FinalList.size();
    out("Count with duplicates: "+ TotalCount);

    int k = 0; // index of the current word in FinalList
    int c = 1; // length of the current run of equal words

    for(String word : FinalList){
        // get statistics
        if (k < FinalList.size()-1){
            if (word.equals(FinalList.get(k + 1))) {
                // if current and next word are equals
                c++;
            } else {
                // if not - we done, now we need to save the count to the array
                Object Table[] = new Object[3];
                Table[1] = c;
                Table[0] = word;
                RatedFinalList.add(Table);
                // reset count for new word
                c = 1;
            }
        } else {
            // last element: flush the final run
            Object Table[] = new Object[3];
            Table[1] = c;
            Table[0] = word;
            RatedFinalList.add(Table);
            c = 1;
        }
        k++;
    }

    for(Object[] i : RatedFinalList){
        // clean WordBase: at most one replacement per word, first match wins
        String CurString = i[0].toString();

        if(Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_ed"))
                & WordBase.contains(CurString + CurString.substring(CurString.length()-1) + "ed")){
            // update words like stopped, tarred
            Replace(WordBase, CurString + CurString.substring(CurString.length()-1) + "ed", CurString);
        } else if (Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_ed")) & WordBase.contains(CurString+"d")){
            Replace(WordBase, CurString+"d", CurString); // update word in WordBase, set word without ed there
        } else if (Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_s")) & WordBase.contains(CurString+"s")){
            Replace(WordBase, CurString+"s", CurString); // update word in WordBase, set word without s there
        } else if (Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_s")) & WordBase.contains(CurString+"es")){
            Replace(WordBase, CurString+"es", CurString); // update word in WordBase, set word without s there
        } else if (Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_ly")) & WordBase.contains(CurString+"ly")){
            Replace(WordBase, CurString+"ly", CurString); // update word in WordBase, set word without ly there
        } else if (Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_ing"))
                & (WordBase.contains(CurString+"ing")
                || WordBase.contains(CurString.substring(0, CurString.length()-1) + "ing")
                || WordBase.contains(CurString + CurString.charAt(CurString.length()-1) + "ing"))){
            // The parentheses matter: '&' binds tighter than '||', so without
            // them the F_ing setting gated only the first lookup and this branch
            // could run (and shadow the "ally"/"ed" branches) with F_ing off.

            // for words like getting
            String PlustIng = CurString + CurString.charAt(CurString.length()-1) + "ing";
            out(PlustIng);

            if(CurString.endsWith("e")){
                Replace(WordBase, CurString.substring(0, CurString.length()-1)+"ing", CurString);
            } else if(WordBase.contains(PlustIng)){
                Replace(WordBase, PlustIng, CurString);
            } else {
                Replace(WordBase, CurString+"ing", CurString); // update word in WordBase, set word without ing there
            }
        } else if (Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_ly")) & WordBase.contains(CurString+"ally")){
            Replace(WordBase, CurString+"ally", CurString); // update word in WordBase, set word without ly there
        } else if (Boolean.parseBoolean(Settings.GetProps(SettingFile, "F_ed")) & WordBase.contains(CurString+"ed")){
            // NOTE(review): with '||' the replacement is skipped only when BOTH
            // lists contain CurString+"e"; '&&' may have been intended - confirm.
            if(CleanArray.contains(CurString+"e") == false || Bases[1].contains(CurString+"e") == false){
                // update word in WordBase, set word without d there for words like compare
                Replace(WordBase, CurString+"ed", CurString);
            }
        }
    }

    out("Count after filters: "+RatedFinalList.size());

    return RatedFinalList;
}

26

Соседние файлы в предмете [НЕСОРТИРОВАННОЕ]