hive取补集 excel取补集

转载

feiry 2023-07-12 21:48:22

文章标签 hive取补集 #include i++ 数据 文章分类 Hive 大数据

求两列数据交集的补集

一同学给我发来一段代码，是求出两列数据的交集的补集，要我看看有没有更高效的算法。

这两列数据（电话号码）是放在EXCEL 里的。于是在网上找了下，看看有没有EXCEL 里的公式

结果整出来个这个：

=INDEX($A:$A,SMALL(IF(COUNTIF($B$1:$B$40,$A$1:$A$40)=0,ROW($A$1:$A$40),65536),ROW(A1)))&""

从($B$1:$B$40 ,$A$1:$A$40)可以看出，这里只是查A，B 列的40行，要是想查更多的，自己改下就是了。

但是这个方法在数据量大的时候，就相当还靠谱了。

于是就写了如下的程序

/*
程序功能： 比较两列数据的差别，找出不同的数据项

*/
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>

using namespace std;


void readFile(char* fName,vector<unsigned>& colume)
{
    unsigned l = 0;

    ifstream file(fName);
    while( !file.eof() )
    {
        file >> l;
        colume.push_back(l);
    }

    sort(colume.begin(),colume.end()); 

    vector<unsigned> ::iterator end_unique = unique(colume.begin(),colume.end());
    colume.erase(end_unique,colume.end());

    file.close();
}

void writeFile( char* fName, vector<unsigned>& colume )
{
    ofstream file( fName );
    for(int i = 0 ; i < colume.size(); i++)
    {
    //    cout << colume[i] << endl;
        file << colume[i] << endl;
    
    }
    file.close();
}

void compare( vector<unsigned>& col_1, vector<unsigned>& col_2 )
{
    bool match;
    for( int i = 0; i < col_1.size(); i++)
    {
        match = 0; // 匹配元素
        for( int j = 0 ; j<col_2.size(); j++)
        {
            if( col_1[i] == col_2[j])
            {
                match = true;
                break;                        
            }

        }
        if( !match )
        {
            cout << col_1[i] << endl;
        }
    }
}

void compare2 (vector<unsigned>& col_1,vector<unsigned>& col_2 )
{
    vector<unsigned>::iterator iter_i = col_1.begin();
    vector<unsigned>::iterator iter_j = col_2.begin();

    for(int i=0,j=0; i<col_1.size() && j< col_2.size(); )
    {
        
        if( col_1[i] == col_2[j] )
        {
            iter_i = find(col_1.begin(),col_1.end(),col_1[i]);
            iter_j = find(col_2.begin(),col_2.end(),col_2[j]);

            col_1.erase(iter_i);
            col_2.erase(iter_j);

            continue;
        }
        else if ( col_1[i] > col_2[j] )
        {
            j++;
            continue;
        }
        else if( col_1[i] < col_2[j])
        {
            i++;
            continue;
        }
        
    }

}

int main()
{
    vector<unsigned> colume_1, colume_2;
    readFile("1.txt", colume_1);

    readFile("2.txt", colume_2);
    compare2( colume_1, colume_2);

    writeFile("3.txt",colume_1);
    writeFile("4.txt",colume_2);

    return 0;
}

其中compare 函数是用的两个for循环。而compare2 是改进过了的是用的摆动数组的思想。

写了一个生成随机手机号的函数，一会用来测试两个compare 方法的效率。

//Generate the Cell phone number
//first parameter : amount of the cell number your want to created.
//second parameter : writh the data to the file your specify
void numberGenerator(unsigned amount,char* fName)
{
    srand( (unsigned)time(NULL));
    ofstream file( fName );

    for(unsigned i= 0;i< amount;i++)
    {
     file <<"13";
     for(int j = 0;j<9;j++)
     {
        file<<rand()%10;        
     }
     file << endl;

    }
    file.close();
}

这里面就碰到了一个问题，那就是当我们生成的这个手机号是11位的，在用readfile 里把它读入到vector 时，没有达到我们想像中的结果

这时，存在vector 里的全都是0。但是，我们要是用的9位数的时候，确是正常的。可能是在 file >> l; 这条语句里有些字符的转换。

下面把改过之后的CODE 贴出来先

/*
程序功能： 比较两列数据的差别，找出不同的数据项

*/
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
#include <time.h>
#include <stdlib.h>   
#include <stdio.h> 
#include <windows.h>
#include <string>

using namespace std;



void readFile(char* fName,vector<string>& colume)
{
    string l = "";
    int count = 0;

    ifstream file(fName);
    while( !file.eof()  )  
    {
        file >> l;
        colume.push_back(l);
        //count ++;
    }
    //cout << " count in readfile is : " << count <<endl;

    sort(colume.begin(),colume.end()); 

    vector<string> ::iterator end_unique = unique(colume.begin(),colume.end());
    colume.erase(end_unique,colume.end());

    file.close();
}

void writeFile( char* fName, vector<string>& colume )
{
    ofstream file( fName );
    for(int i = 0 ; i < colume.size(); i++)
    {
    //    cout << colume[i] << endl;
        file << colume[i] << endl;
    
    }
    file.close();
}

void compare( vector<string>& col_1, vector<string>& col_2 ,vector<string>& col_3 )
{
    bool match;
    for( int i = 0; i < col_1.size(); i++)
    {
        match = 0; // 匹配元素
        for( int j = 0 ; j<col_2.size(); j++)
        {
            if( col_1[i] == col_2[j])
            {
                match = true;
                break;                        
            }

        }
        if( !match )
        {
            //cout << col_1[i] << endl;
            col_3.push_back(col_1[i]);
        }
    }
}

void compare2 (vector<string>& col_1,vector<string>& col_2)
{
    vector<string>::iterator iter_i = col_1.begin();
    vector<string>::iterator iter_j = col_2.begin();

    for(int i=0,j=0; i<col_1.size() && j< col_2.size(); )
    {
        
        if( col_1[i] == col_2[j] )
        {
            iter_i = find(col_1.begin(),col_1.end(),col_1[i]);
            iter_j = find(col_2.begin(),col_2.end(),col_2[j]);

            if(iter_i != col_1.end() && iter_j != col_2.end()) 
            {
            col_1.erase(iter_i);
            col_2.erase(iter_j);
            }
            continue;
        }
        else if ( col_1[i] > col_2[j] )
        {
            j++;
            continue;
        }
        else if( col_1[i] < col_2[j])
        {
            i++;
            continue;
        }
        
    }

}

//Generate the Cell phone number
//first parameter : amount of the cell number your want to created.
//second parameter : writh the data to the file your specify
void numberGenerator(unsigned amount,char* fName)
{
    srand( (unsigned)time(NULL));
    ofstream file( fName );

    for(unsigned i= 0;i< amount;i++)
    {
     file <<"13";
     for(int j = 0;j<9;j++)
     {
        file<<rand()%10;        
     }
     file << endl;

    }
    file.close();
}

int main()
{
    vector<string> colume_1, colume_2,colume_3,colume_4,colume_5;



    numberGenerator(10000,"1.txt");
    numberGenerator(10000,"2.txt");

    readFile("1.txt", colume_1);
    readFile("2.txt", colume_2);

    DWORD dwStartTick1 = GetTickCount();

    compare2( colume_1, colume_2);

    DWORD dwTimeElapsed1 = ((DWORD)GetTickCount() - dwStartTick1);

    writeFile("3.txt",colume_1);
    writeFile("4.txt",colume_2);


    readFile("1.txt", colume_4);
        readFile("2.txt", colume_5);

    DWORD dwStartTick2 = GetTickCount();

    compare(colume_4,colume_5, colume_3);

    DWORD dwTimeElapsed2 = ((DWORD)GetTickCount() - dwStartTick2)/1000;

    writeFile("5.txt",colume_3);

    cout << " compare2 algorithm elapsed time " <<  dwTimeElapsed1 << " millisecond ."<<endl;

    cout << " compare1 algorithm elapsed time " <<  dwTimeElapsed2 << " second ." << endl;

    

    return 0;
}

接下来想把那个compare2 方法给改写一下，因为之前我们在compare2 里有把数据从vector 里删掉，但是这样是没什么意义的

其实只要把不同的数据写入另一个vector 就可以了。

但是，就是这样做的时候，发现了一个问题：当往vector<string> 放在元素个数大于9996的时候，会出一个

Debug assertion failed. Expression: Vector subscript out of range

void compare3 (vector<string>& col_1,vector<string>& col_2,vector<string>& col_3)
{
    int count =0 ;
    for(int i=0,j=0; i<col_1.size() && j< col_2.size(); )
    {
        if(count > 9996 )
        {
            cout <<"the vector capacity is : "<< col_3.capacity();
          
        
        }

        if( col_1[i] == col_2[j] )
        {
            //iter_i = find(col_1.begin(),col_1.end(),col_1[i]);
            //iter_j = find(col_2.begin(),col_2.end(),col_2[j]);

            /*if(iter_i != col_1.end() && iter_j != col_2.end()) 
            {
            col_1.erase(iter_i);
            col_2.erase(iter_j);
            }*/
            
            i++;
            j++;

            continue;
        }
        else if ( col_1[i] > col_2[j] )
        {
            j++;
            count ++;
            col_3.push_back(col_2[j]);
            cout <<" current account is : "<< count << "\t" << col_2[j]<< endl;
            
            continue;
        }
        else if( col_1[i] < col_2[j])
        {
            i++;
            count ++;
            col_3.push_back(col_1[i]);
            cout <<" current account is : "<< count <<"\t" << col_1[i]<< endl;

            continue;
        }
        
    }

}

上面的问题是下标访问越界，很常见的错误，刚才居然没有发现，小小的BS自己一下下.

把 for(int i=0,j=0; i<col_1.size() && j< col_2.size(); ) 改成 for(int i=0,j=0; i<col_1.size()-1 && j< col_2.size()-1 ; )就可以了。

改过后的函数给贴出来

void compare3 (vector<string>& col_1,vector<string>& col_2,vector<string>& col_3)
{

    for(int i=0,j=0; i<col_1.size()-1 && j< col_2.size()-1; )
    {

        if( col_1[i] == col_2[j] )
        {
            i++;
            j++;
             continue;
        }
        else if ( col_1[i] > col_2[j] )
        {
            j++;
            col_3.push_back(col_2[j]);
            continue;
        }
        else if( col_1[i] < col_2[j])
        {
            i++;
            col_3.push_back(col_1[i]);

            continue;
        }
        
    }

}

那我们来比较一下这几个不同的方法

compare1 为用两个for 来做遍历的。compare2 为把相同的号码从自己的vector里删掉的。compare3为把不同的号码从vector 里拿出来考到一个新的vector 里的

测试数据

a) 用一个随机生成手机号的方法 13********* ，11位的手机号。各生成的5000 条放在1.txt 和 2.txt 。所以总共有10000 条数据，这两个文件里只有10个手机号是相同的。（极端情况下）

b) 用一个随机生成手机号的方法 13********* ，11位的手机号。各生成的5000 条放在1.txt 和 2.txt 。所以总共有10000 条数据，这两个文件里只有10个手机号是不相同的。（极端情况下）

测试结果

在测试数据a 的情况下 compare1 用时 1286063 毫秒，compare2 用时 297 毫秒， compare3 用时641毫秒。

在测试数据b 的情况下 compare1 用时 641625 毫秒，compare2 用时 1333657 毫秒， compare3 用时31毫秒。

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。